konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107):
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,86 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Konduktor backend interface."""
14
+
15
+ import typing
16
+ from typing import Dict, Optional
17
+
18
+ if typing.TYPE_CHECKING:
19
+ from konduktor.data import storage as storage_lib
20
+
21
+ import konduktor
22
+ from konduktor.utils import ux_utils
23
+
24
# Paths are handled as plain strings throughout this module; the alias only
# makes the signatures below easier to read.
Path = str
25
+
26
+
27
class Backend:
    """Base interface for Konduktor backends.

    A backend handles provisioning, setup, and scheduling. The public methods
    are thin entry points; most of them forward to an underscore-prefixed hook
    of the same name, and every hook raises ``NotImplementedError`` until a
    concrete backend overrides it.
    """

    # Identifier used to pick the backend class from the CLI / YAML.
    NAME = 'backend'

    # --- Hooks that concrete backends override ---
    def _sync_workdir(self, workdir: Path) -> None:
        raise NotImplementedError

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        raise NotImplementedError

    def _execute(
        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
    ) -> Optional[str]:
        raise NotImplementedError

    def _post_execute(self) -> None:
        raise NotImplementedError

    # --- Public APIs ---
    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
        """Tell whether the cluster can satisfy the task's resources."""
        raise NotImplementedError

    def add_storage_objects(self, task: 'konduktor.Task') -> None:
        """Not implemented in the base class; concrete backends override it."""
        raise NotImplementedError

    def register_info(self, **kwargs) -> None:
        """Register backend-specific information (no-op by default)."""
        pass

    def sync_workdir(self, workdir: Path) -> None:
        """Forward ``workdir`` to the ``_sync_workdir`` hook."""
        return self._sync_workdir(workdir)

    def sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Forward both mount mappings to the ``_sync_file_mounts`` hook."""
        return self._sync_file_mounts(all_file_mounts, storage_mounts)

    def execute(
        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
    ) -> Optional[str]:
        """Run the task on the cluster via the ``_execute`` hook.

        Returns:
            Job id if the task is submitted to the cluster, None otherwise.
        """
        # NOTE(review): the value returned by spinner_message() is discarded
        # here -- confirm whether the call has a side effect worth keeping.
        ux_utils.spinner_message('Submitting job')
        return self._execute(task, detach_run, dryrun)

    def post_execute(self) -> None:
        """Follow-up to execute(), e.g. printing helpful inspection messages."""
        return self._post_execute()
@@ -0,0 +1,21 @@
1
# NOTE(review): presumably the SSH port used for Konduktor workloads --
# confirm against the code that consumes it.
KONDUKTOR_SSH_PORT = 2222

# ---------------------------------------------------------------------------
# Labels shared by JobSets and Deployments
# ---------------------------------------------------------------------------
JOB_NAME_LABEL = 'trainy.ai/job-name'
DEPLOYMENT_NAME_LABEL = 'trainy.ai/deployment-name'
AIBRIX_NAME_LABEL = 'model.aibrix.ai/name'
USERID_LABEL = 'trainy.ai/user-id'
USER_LABEL = 'trainy.ai/username'
ACCELERATOR_LABEL = 'trainy.ai/accelerator'
NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
# Unlike the labels above, this key lives under the kueue.x-k8s.io prefix.
MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'

# ---------------------------------------------------------------------------
# Labels related to start/stop/status
# ---------------------------------------------------------------------------
STOP_USERID_LABEL = 'trainy.ai/stop-userid'
STOP_USERNAME_LABEL = 'trainy.ai/stop-username'

# ---------------------------------------------------------------------------
# Labels attached to secrets
# ---------------------------------------------------------------------------
SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'
SECRET_KIND_LABEL = 'trainy.ai/secret-kind'
SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'
ROOT_NAME = 'trainy.ai/root-name'
@@ -0,0 +1,204 @@
1
+ import time
2
+ import typing
3
+ from typing import Dict, Optional
4
+
5
+ import colorama
6
+
7
+ if typing.TYPE_CHECKING:
8
+ import konduktor
9
+ from konduktor.data import storage as storage_lib
10
+
11
+ from kubernetes.client.exceptions import ApiException
12
+
13
+ from konduktor import config, kube_client, logging
14
+ from konduktor.backends import backend, deployment_utils, pod_utils
15
+ from konduktor.utils import kubernetes_utils, rich_utils, ux_utils
16
+
17
# Paths are handled as plain strings; the alias keeps signatures readable.
Path = str

# Module-level logger, keyed by this file's path.
logger = logging.get_logger(__file__)

# Seconds slept between two readiness polls in _wait_for_all_ready().
POLL_INTERVAL = 5

# Fallback readiness timeout, in seconds, used when the
# ('kubernetes', 'provision_timeout') config entry is not set.
DEFAULT_ATTACH_TIMEOUT = 300
22
+
23
+
24
class DeploymentError(Exception):
    """Raised when a serving deployment has missing parts or never gets ready."""
26
+
27
+
28
def _wait_for_all_ready(namespace: str, name: str) -> None:
    """Wait for Deployment, Service, and Autoscaler readiness.

    Polls the cluster every ``POLL_INTERVAL`` seconds until
    ``deployment_utils.get_model_status`` reports the deployment and service
    as ``'ready'`` and the autoscaler as ``'ready'`` or ``None``.

    Args:
        namespace: Kubernetes namespace holding the serving resources.
        name: Name shared by the Deployment, Service, and autoscaler.

    Raises:
        DeploymentError: If any component is reported as ``'missing'``, or if
            readiness is not reached within the configured
            ``kubernetes.provision_timeout`` (a value of ``-1`` disables the
            timeout). The serving specs are deleted before raising.
    """
    # Short pause before the first poll (kept from the original behaviour).
    time.sleep(2)
    # Use a monotonic clock so wall-clock adjustments cannot shorten or
    # stretch the timeout.
    start = time.monotonic()
    timeout = config.get_nested(
        ('kubernetes', 'provision_timeout'),
        default_value=DEFAULT_ATTACH_TIMEOUT,
    )

    while True:
        context = kubernetes_utils.get_current_kube_config_context_name()

        # Directly read objects instead of listing everything
        try:
            deployment = kube_client.apps_api(context).read_namespaced_deployment(
                name=name, namespace=namespace
            )
            deployments_map = {name: deployment}
        except ApiException:
            # Reset explicitly: otherwise `deployment` is unbound on the first
            # iteration (or stale from a previous one) when the autoscaler
            # branch below inspects it.
            deployment = None
            deployments_map = {}

        try:
            service = kube_client.core_api(context).read_namespaced_service(
                name=name, namespace=namespace
            )
            services_map = {name: service}
        except ApiException:
            services_map = {}

        autoscalers_map = {}
        try:
            autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
            if autoscaler_obj:
                # detect aibrix vs general from deployment labels
                labels = (deployment.metadata.labels or {}) if deployment else {}
                is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
                # AIBrix-labelled deployments are filed under 'kpa', all
                # others under 'hpa'.
                autoscaler_kind = 'kpa' if is_aibrix else 'hpa'
                autoscalers_map[name] = {autoscaler_kind: autoscaler_obj}
        except ApiException:
            # Best effort: a failed autoscaler lookup leaves the map empty.
            pass

        status = deployment_utils.get_model_status(
            name, deployments_map, services_map, autoscalers_map
        )

        is_ready = (
            status['deployment'] == 'ready'
            and status['service'] == 'ready'
            and (status['autoscaler'] == 'ready' or status['autoscaler'] is None)
        )

        states = {
            'Deployment': status['deployment'],
            'Service': status['service'],
            'Autoscaler': status['autoscaler'],
        }

        # Figure out which components are missing
        missing_parts = [
            component for component, state in states.items() if state == 'missing'
        ]

        if missing_parts:
            # Clean up whatever was created before reporting the failure.
            deployment_utils.delete_serving_specs(name, namespace)
            missing_str = ', '.join(missing_parts)
            raise DeploymentError(
                f'Deployment failed. '
                f'The following components are missing: {missing_str}.'
            )

        if is_ready:
            logger.info(
                f'task {colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'{name}{colorama.Style.RESET_ALL} ready'
            )
            return

        # A timeout of -1 means "wait indefinitely".
        if timeout != -1 and time.monotonic() - start > timeout:
            logger.error(
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'Model timed out waiting for readiness.'
                f'{colorama.Style.RESET_ALL}'
                f' Final status:\n{status}'
            )
            deployment_utils.delete_serving_specs(name, namespace)
            raise DeploymentError(
                f'Model failed to become ready within {timeout} seconds.\n'
            )

        time.sleep(POLL_INTERVAL)
118
+
119
+
120
class DeploymentBackend(backend.Backend):
    """Backend that runs a task as a long-running Deployment.

    Only ``_execute`` does real work; every other hook is a no-op and
    ``check_resources_fit_cluster`` always answers ``True``.
    """

    NAME = 'deployment'

    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
        """Always return True; no capacity check is done here."""
        return True

    def add_storage_objects(self, task: 'konduktor.Task') -> None:
        """No-op for this backend."""
        pass

    def register_info(self, **kwargs) -> None:
        """No-op for this backend."""
        pass

    def _sync_workdir(self, workdir: str) -> None:
        """No-op for this backend."""
        pass

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """No-op for this backend."""
        pass

    def _post_execute(self) -> None:
        """No-op for this backend."""
        pass

    def _execute(
        self,
        task: 'konduktor.Task',
        detach_run: bool = False,
        dryrun: bool = False,
    ) -> Optional[str]:
        """Execute a task by launching a long-running Deployment.

        Serving resources are created only when this is not a dry run and the
        task has a ``serving`` section. Unless detached (or dry-running), the
        call then blocks until all resources are ready.

        Returns:
            The task name.
        """
        pod_spec = pod_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)

        if not dryrun and task.serving:
            # Keyword arguments common to every create_* helper below.
            shared_kwargs = {'namespace': namespace, 'task': task, 'dryrun': dryrun}

            logger.debug(f'[DEBUG] Creating deployment for task: {task.name}')
            deployment_utils.create_deployment(
                pod_spec=pod_spec['kubernetes']['pod_config'], **shared_kwargs
            )

            logger.debug(f'[DEBUG] Creating service for task: {task.name}')
            deployment_utils.create_service(**shared_kwargs)

            # Create podautoscaler for non-general deployments
            logger.debug(f'[DEBUG] Creating podautoscaler for task: {task.name}')
            deployment_utils.create_pod_autoscaler(**shared_kwargs)

            # HTTP Add-on resources for general deployments
            logger.debug(
                f'[DEBUG] Creating HTTP Add-on resources for task: {task.name}'
            )
            deployment_utils.create_http_addon_resources(**shared_kwargs)

        if dryrun or detach_run:
            logger.info('detaching from run.')
        else:
            with ux_utils.print_exception_no_traceback():
                with rich_utils.safe_status(
                    ux_utils.spinner_message('waiting for resources to be ready.\n')
                ):
                    _wait_for_all_ready(namespace, task.name)
            logger.info(
                f"Model '{task.name}' is ready. "
                f'Run `konduktor serve status` for details.'
            )

        return task.name