konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,225 @@
1
+ """Batch job execution via k8s jobsets
2
+ https://jobset.sigs.k8s.io/
3
+ https://kueue.sigs.k8s.io/docs/tasks/run/jobsets/
4
+ """
5
+
6
+ import threading
7
+ import time
8
+ import typing
9
+ from typing import Any, Dict, Optional, Tuple
10
+
11
+ import colorama
12
+
13
+ if typing.TYPE_CHECKING:
14
+ import konduktor
15
+ from konduktor.data import storage as storage_lib
16
+
17
+ from konduktor import config, logging
18
+ from konduktor.backends import backend, jobset_utils, pod_utils
19
+ from konduktor.utils import kubernetes_utils, log_utils, rich_utils, ux_utils
20
+
21
+ Path = str  # alias for path strings; used in the `_sync_file_mounts` annotations
22
+ logger = logging.get_logger(__file__)
23
+
24
+ POLL_INTERVAL = 5  # seconds slept between jobset status polls
25
+ DEFAULT_ATTACH_TIMEOUT = 86400 # 1 day, in seconds; fallback when `kubernetes.provision_timeout` is unset
26
+ FLUSH_LOGS_TIMEOUT = 5  # NOTE(review): not referenced in the visible code; presumably seconds - confirm before removing
27
+
28
+
29
+ class JobsetError(Exception):  # raised on job backoff failure, start timeout, and wrapped errors in `_execute`
30
+ pass
31
+
32
+
33
def _raise_job_error(job):
    """Inspect a failed job's conditions and raise a descriptive error.

    Args:
        job: Job object exposing ``status.conditions`` (items with
            ``message`` and ``reason``), ``spec.template.spec.containers``
            and ``metadata.name``.

    Raises:
        ValueError: If a condition message mentions ``ConfigIssue``; the
            message points the user at the image and registry credentials.
        JobsetError: If a condition reason is ``BackoffLimitExceeded``.

    If no condition matches, an error is logged and the function returns
    normally, so callers must not rely on it always raising.
    """
    # `conditions` and `message` may be unset on a job whose status is only
    # partially populated; treat them as empty instead of raising TypeError.
    for condition in job.status.conditions or ():
        if 'ConfigIssue' in (condition.message or ''):
            raise ValueError(
                'Job failed with '
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'ConfigIssue: ErrImagePull.{colorama.Style.RESET_ALL} '
                f'Check that your '
                f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
                f'`image_id`{colorama.Style.RESET_ALL} is correct and '
                f'your container credentials are correct. Image specified '
                f'in your task definition is '
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'`{job.spec.template.spec.containers[0].image}`'
                f'{colorama.Style.RESET_ALL}'
            )
        if condition.reason == 'BackoffLimitExceeded':
            raise JobsetError('Job failed with non-zero exit code.')
    logger.error(
        'Job failed with unknown error. Check jobset status in k8s with '
        f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
        f'`kubectl get job -o yaml {job.metadata.name}`'
        f'{colorama.Style.RESET_ALL}'
    )
58
+
59
+
60
def _wait_for_jobset_start(namespace: str, job_name: str):
    """Block until the jobset is ready, has finished, or times out.

    Polls the jobset every ``POLL_INTERVAL`` seconds and inspects the first
    entry of ``status.replicatedJobsStatus``.

    Args:
        namespace: Namespace the jobset lives in.
        job_name: Name of the jobset to wait for.

    Raises:
        AssertionError: If the jobset cannot be found.
        JobsetError: If the jobset does not start within the configured
            ``kubernetes.provision_timeout`` (``-1`` disables the timeout);
            the jobset is deleted first. Also propagated from
            ``_raise_job_error`` when the job has failed.
        ValueError: Propagated from ``_raise_job_error``.
    """
    # Short pause before the first poll.
    time.sleep(2)
    start = time.time()
    timeout = config.get_nested(
        ('kubernetes', 'provision_timeout'),
        default_value=DEFAULT_ATTACH_TIMEOUT,
    )

    while True:
        jobsets = jobset_utils.get_jobset(namespace, job_name)
        assert jobsets is not None, (
            f'Jobset {job_name} ' f'not found in namespace {namespace}'
        )
        # The status block, or its replicated-jobs list, may not be populated
        # yet; treat that as "not started" and keep polling rather than
        # failing with KeyError/IndexError.
        status = jobsets['status'] if 'status' in jobsets else None
        replicated = (
            status['replicatedJobsStatus']
            if status and 'replicatedJobsStatus' in status
            else None
        )
        if replicated:
            first = replicated[0]
            if first['ready']:
                logger.info(
                    f'task '
                    f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                    f'{colorama.Style.RESET_ALL} ready'
                )
                return
            if first['succeeded']:
                # Finished before we ever observed it as ready.
                return
            if first['failed']:
                logger.info(
                    f'job '
                    f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                    f'{colorama.Style.RESET_ALL} '
                    f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
                )
                job = jobset_utils.get_job(namespace, job_name)
                _raise_job_error(job)
                return
        if timeout != -1 and time.time() - start > timeout:
            logger.error(
                f'{colorama.Style.BRIGHT}'
                f'{colorama.Fore.RED}Job timed out to schedule.'
                f'{colorama.Style.RESET_ALL}. Deleting job'
            )
            jobset_utils.delete_jobset(namespace, job_name)
            # The hint names the config key that is actually read above.
            raise JobsetError(
                'Job failed to start within '
                f'timeout of {timeout} seconds. '
                f'Increase or disable timeout '
                f'{colorama.Style.BRIGHT}'
                '`kubernetes.provision_timeout: -1`'
                f'{colorama.Style.RESET_ALL}'
            )
        time.sleep(POLL_INTERVAL)
109
+
110
+
111
def _wait_for_jobset_completion(namespace: str, job_name: str) -> Tuple[bool, str]:
    """Poll the jobset until it reports success or failure.

    Polls every ``POLL_INTERVAL`` seconds with no timeout and inspects the
    first entry of ``status.replicatedJobsStatus``.

    Args:
        namespace: Namespace the jobset lives in.
        job_name: Name of the jobset to wait for.

    Returns:
        ``(True, message)`` when the jobset succeeded, ``(False, message)``
        when it failed; ``message`` is a colorized line for the user.

    Raises:
        AssertionError: If the jobset cannot be found.
    """
    while True:
        jobsets = jobset_utils.get_jobset(namespace, job_name)
        assert jobsets is not None, (
            f'Jobset {job_name} ' f'not found in namespace {namespace}'
        )
        # Same guard as the start-wait loop: a missing or empty status is
        # "not finished yet", not a KeyError/IndexError.
        status = jobsets['status'] if 'status' in jobsets else None
        replicated = (
            status['replicatedJobsStatus']
            if status and 'replicatedJobsStatus' in status
            else None
        )
        if replicated:
            first = replicated[0]
            if first['succeeded']:
                msg = (
                    f'task '
                    f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                    f'{colorama.Style.RESET_ALL} {colorama.Fore.GREEN}'
                    f'{colorama.Style.BRIGHT}finished{colorama.Style.RESET_ALL}'
                )
                return True, msg
            if first['failed']:
                msg = (
                    f'task '
                    f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                    f'{colorama.Style.RESET_ALL} {colorama.Fore.RED}'
                    f'{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
                )
                return False, msg
        time.sleep(POLL_INTERVAL)
134
+
135
+
136
class JobsetBackend(backend.Backend):
    """Backend that runs a task by creating a Kubernetes jobset."""

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Syncs files/directories to cloud storage before job launch.

        Intended to upload local files/dirs to cloud storage so pods can
        download them on start. Currently a no-op in this backend.
        """
        pass

    def _sync_workdir(self, workdir: str) -> None:
        """Syncs the working directory to cloud storage before job launch.

        Currently a no-op in this backend.
        """
        pass

    def _post_execute(self) -> None:
        """Hook run after execution; currently a no-op.

        TODO(asaiacai): add some helpful messages/commands that a user can run
        to inspect the status of their jobset.
        """
        pass

    def _execute(
        self, task: 'konduktor.Task', detach_run: bool = False, dryrun: bool = False
    ) -> Optional[str]:
        """Executes the task on the cluster by creating a jobset.

        Unless ``dryrun`` or ``detach_run`` is set, waits for the jobset to
        start, streams its logs on a daemon thread, and waits for completion.

        Args:
            task: Task to run; ``task.name`` is used as the jobset name.
            detach_run: If True, return right after creating the jobset.
            dryrun: Forwarded to ``jobset_utils.create_jobset``; also skips
                the wait/log-streaming phase.

        Returns:
            ``task.name``. Every code path returns it, including dry runs.

        Raises:
            JobsetError: Wraps any non-KeyboardInterrupt error raised while
                streaming logs or waiting for completion. Errors raised while
                waiting for the jobset to start propagate unchanged.
        """

        # we should consider just building an image with the cloud provider
        # sdks baked in. These can initialize and pull files first before
        # the working container starts.

        # first define the pod spec then create the jobset definition
        pod_spec = pod_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        # TODO(asaiacai): need to set env variables in pod
        jobset_response: Optional[Dict[str, Any]] = jobset_utils.create_jobset(
            namespace,
            task,
            pod_spec['kubernetes']['pod_config'],
            dryrun=dryrun,
        )

        if not dryrun and not detach_run:
            with ux_utils.print_exception_no_traceback():
                # NOTE(review): Ctrl+C during this start wait is not caught
                # here even though the spinner advertises it; presumably a
                # caller handles KeyboardInterrupt - confirm.
                with rich_utils.safe_status(
                    ux_utils.spinner_message(
                        'waiting for job to start. ' 'Press Ctrl+C to detach. \n'
                    )
                ):
                    _wait_for_jobset_start(namespace, task.name)
                try:
                    assert jobset_response is not None
                    # Daemon thread so a stuck log stream never blocks exit.
                    log_thread = threading.Thread(
                        target=log_utils.tail_logs,
                        args=(task.name,),
                        daemon=True,
                    )
                    logger.info('streaming logs...')
                    log_thread.start()
                    is_success, msg = _wait_for_jobset_completion(namespace, task.name)
                    # give the job sometime to flush logs
                    log_thread.join(
                        timeout=config.get_nested(('logs', 'timeout'), 60.0)
                    )
                    if not is_success:
                        logger.error(msg)
                    else:
                        logger.info(msg)
                except KeyboardInterrupt:
                    logger.info('detaching from log stream...')
                except Exception as err:
                    logger.error(
                        f'Check if job resources are '
                        f'active/queued with '
                        f'{colorama.Style.BRIGHT}'
                        f'`konduktor status`'
                        f'{colorama.Style.RESET_ALL}'
                    )
                    # Chain explicitly so the root cause stays attached.
                    raise JobsetError(f'error: {err}') from err
        else:
            logger.info('detaching from run.')
        return task.name