konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,461 @@
+ """This module contains a custom validator for the JSON Schema specification.
+
+ The main motivation behind extending the existing JSON Schema validator is to
+ allow for case-insensitive enum matching since this is currently not supported
+ by the JSON Schema specification.
+ """
+
+ import base64
+ import json
+ import os
+ import re
+ import subprocess
+ import time
+ from pathlib import Path
+ from typing import Tuple
+
+ import jsonschema
+ import requests
+ from colorama import Fore, Style
+ from filelock import FileLock
+
+ from konduktor import logging
+
+ SCHEMA_VERSION = 'v1.32.0-standalone-strict'
+ SCHEMA_CACHE_PATH = Path.home() / '.konduktor/schemas'
+ SCHEMA_LOCK_PATH = SCHEMA_CACHE_PATH / '.lock'
+ CACHE_MAX_AGE_SECONDS = 86400  # 24 hours
+
+ # Schema URLs for different Kubernetes resources
+ SCHEMA_URLS = {
+     'podspec': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/podspec.json',
+     'deployment': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/deployment.json',
+     'service': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/service.json',
+     'horizontalpodautoscaler': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/horizontalpodautoscaler-autoscaling-v2.json',
+ }
+
+ logger = logging.get_logger(__name__)
+
+
+ def _skip_image_checks() -> bool:
+     val = os.getenv('KONDUKTOR_SKIP_IMAGE_CHECK', '')
+     return val.lower() in ('1', 'true', 'yes', 'y')
+
+
+ def case_insensitive_enum(validator, enums, instance, schema):
+     del validator, schema  # Unused.
+     if instance.lower() not in [enum.lower() for enum in enums]:
+         yield jsonschema.ValidationError(f'{instance!r} is not one of {enums!r}')
+
+
+ SchemaValidator = jsonschema.validators.extend(
+     jsonschema.Draft7Validator,
+     validators={'case_insensitive_enum': case_insensitive_enum},
+ )
+
+
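A minimal usage sketch (not part of the package) of the extended validator; the schema dict below is invented for illustration, but the `case_insensitive_enum` keyword is the one registered above:

```python
# Illustrative only: exercising the case-insensitive keyword registered above.
import jsonschema

from konduktor.utils import validator

schema = {'type': 'string', 'case_insensitive_enum': ['H100', 'A100']}

validator.SchemaValidator(schema).validate('h100')  # passes: matching ignores case
try:
    validator.SchemaValidator(schema).validate('V100')
except jsonschema.ValidationError as e:
    print(f'rejected: {e.message}')
```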
+ def get_cached_schema(schema_type: str) -> dict:
+     """Get cached schema for a specific Kubernetes resource type."""
+     schema_url = SCHEMA_URLS.get(schema_type)
+     if not schema_url:
+         raise ValueError(f'Unknown schema type: {schema_type}')
+
+     schema_file = SCHEMA_CACHE_PATH / f'{schema_type}.json'
+     lock = FileLock(str(SCHEMA_LOCK_PATH))
+
+     with lock:
+         # Check if schema file exists and is fresh
+         if schema_file.exists():
+             age = time.time() - schema_file.stat().st_mtime
+             # if fresh
+             if age < CACHE_MAX_AGE_SECONDS:
+                 with open(schema_file, 'r') as f:
+                     return json.load(f)
+
+         # Download schema
+         resp = requests.get(schema_url)
+         resp.raise_for_status()
+
+         SCHEMA_CACHE_PATH.mkdir(parents=True, exist_ok=True)
+         with open(schema_file, 'w') as f:
+             f.write(resp.text)
+
+         return resp.json()
+
+
+ def _validate_k8s_spec(spec: dict, schema_type: str, resource_name: str) -> None:
+     """Generic validation function for Kubernetes specs."""
+     schema = get_cached_schema(schema_type)
+
+     validator = jsonschema.Draft7Validator(schema)
+     errors = sorted(validator.iter_errors(spec), key=lambda e: e.path)
+
+     if not errors:
+         return
+
+     formatted = [
+         f'- {error.message}'
+         + (f" at path: {' → '.join(str(p) for p in error.path)}" if error.path else '')
+         for error in errors
+     ]
+
+     # Clean log
+     logger.debug('Invalid k8s %s spec/config:\n%s', resource_name, '\n'.join(formatted))
+
+     # Color only in CLI
+     formatted_colored = [
+         f'{Fore.RED}- {error.message}'
+         + (f" at path: {' → '.join(str(p) for p in error.path)}" if error.path else '')
+         + Style.RESET_ALL
+         for error in errors
+     ]
+
+     raise ValueError(
+         f'\n{Fore.RED}Invalid k8s {resource_name} spec/config: {Style.RESET_ALL}\n'
+         + '\n'.join(formatted_colored)
+     )
+
+
+ def validate_pod_spec(pod_spec: dict) -> None:
+     """Validate a Kubernetes pod spec."""
+     _validate_k8s_spec(pod_spec, 'podspec', 'pod')
+
+
+ def validate_deployment_spec(deployment_spec: dict) -> None:
+     """Validate a Kubernetes deployment spec."""
+     _validate_k8s_spec(deployment_spec, 'deployment', 'deployment')
+
+
+ def validate_service_spec(service_spec: dict) -> None:
+     """Validate a Kubernetes service spec."""
+     _validate_k8s_spec(service_spec, 'service', 'service')
+
+
+ def validate_horizontalpodautoscaler_spec(hpa_spec: dict) -> None:
+     """Validate a Kubernetes HorizontalPodAutoscaler spec."""
+     _validate_k8s_spec(hpa_spec, 'horizontalpodautoscaler', 'horizontalpodautoscaler')
+
+
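A hedged sketch of how these wrappers might be called; the pod spec below is a made-up minimal example, and on first use the matching schema is downloaded and cached under `~/.konduktor/schemas` for 24 hours:

```python
# Illustrative only: validating a hypothetical minimal pod spec dict.
from konduktor.utils import validator

pod_spec = {
    'containers': [{'name': 'train', 'image': 'ubuntu:22.04'}],
    'restartPolicy': 'Never',
}

try:
    validator.validate_pod_spec(pod_spec)
except ValueError as e:
    print(e)  # one line per schema violation, each with its path
```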
+ def validate_docker_image(image_id: str) -> Tuple[str, str]:
+     """Validate if a Docker image exists and is accessible.
+
+     Args:
+         image_id: The Docker image ID to validate
+             (e.g., 'ubuntu:latest', 'gcr.io/project/image:tag')
+
+     Returns:
+         Tuple of (status, message) where status is:
+         - 'valid': Image definitely exists
+         - 'warning': Couldn't validate, but might be valid
+         - 'invalid': Image definitely doesn't exist
+     """
+     if not image_id or not isinstance(image_id, str):
+         return 'invalid', 'Image ID must be a non-empty string'
+
+     # Basic format validation
+     if not _is_valid_docker_image_format(image_id):
+         return 'invalid', f'Invalid Docker image format: {image_id}'
+
+     # Try registry API validation first (works without Docker daemon)
+     registry_result = _validate_image_in_registry(image_id)
+     if registry_result[0] in ['valid', 'invalid']:
+         return registry_result
+
+     # If registry validation couldn't determine, try local Docker as fallback
+     if _can_pull_image_locally(image_id):
+         return 'valid', f"Docker image '{image_id}' validated locally"
+
+     # Return the registry result (warning)
+     return registry_result
+
+
+ def _is_valid_docker_image_format(image_id: str) -> bool:
+     """Check if the image ID follows valid Docker image naming conventions."""
+     # Basic regex for Docker image names
+     # Supports: name:tag, registry/name:tag, registry/namespace/name:tag
+     pattern = (
+         r'^[a-zA-Z0-9][a-zA-Z0-9._-]*'
+         r'(?:\/[a-zA-Z0-9][a-zA-Z0-9._-]*)*'
+         r'(?::[a-zA-Z0-9._-]+)?$'
+     )
+     return bool(re.match(pattern, image_id))
+
+
+ def _can_pull_image_locally(image_id: str) -> bool:
+     """Try to inspect the image manifest locally to check if it exists."""
+     try:
+         # Use docker manifest inspect instead of pull for faster validation
+         result = subprocess.run(
+             ['docker', 'manifest', 'inspect', image_id],
+             capture_output=True,
+             text=True,
+             timeout=30,  # 30 second timeout
+         )
+
+         # Debug logging
+         logger.debug(
+             f'Local Docker manifest inspect for {image_id}: '
+             f'returncode={result.returncode}, '
+             f"stdout='{result.stdout}', "
+             f"stderr='{result.stderr}'"
+         )
+
+         return result.returncode == 0
+     except (
+         subprocess.TimeoutExpired,
+         FileNotFoundError,
+         subprocess.SubprocessError,
+     ) as e:
+         # Docker not available or timeout
+         logger.debug(f'Local Docker manifest inspect failed for {image_id}: {e}')
+         return False
+
+
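A short sketch of how a caller might branch on the three-state result (the image names are arbitrary examples):

```python
# Illustrative only: consuming the (status, message) contract described above.
from konduktor.utils import validator

for image in ('ubuntu:22.04', 'ghcr.io/some-org/some-image:v1'):
    status, message = validator.validate_docker_image(image)
    if status == 'valid':
        print(f'{image}: verified ({message})')
    elif status == 'warning':
        print(f'{image}: could not verify, submitting anyway ({message})')
    else:  # 'invalid'
        raise ValueError(message)
```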
+ def _validate_image_in_registry(image_id: str) -> Tuple[str, str]:
+     """Validate image exists in registry using API calls."""
+     try:
+         registry, repo, tag = _parse_image_components(image_id)
+
+         if registry == 'docker.io':
+             return _validate_dockerhub_image(repo, tag)
+         elif registry.endswith('gcr.io'):
+             return _validate_gcr_image(registry, repo, tag)
+         elif 'ecr.' in registry and registry.endswith('.amazonaws.com'):
+             return _validate_ecr_image(registry, repo, tag)
+         elif registry == 'nvcr.io':
+             return _validate_nvcr_image(registry, repo, tag)
+         elif registry == 'ghcr.io':
+             return _validate_ghcr_image(registry, repo, tag)
+         elif registry == 'quay.io':
+             return _validate_quay_image(registry, repo, tag)
+         else:
+             # For other registries, we can't easily validate without credentials
+             # Return warning that we couldn't verify
+             return (
+                 'warning',
+                 f"Could not validate '{image_id}' in registry {registry} "
+                 f'(not supported)',
+             )
+
+     except Exception as e:
+         logger.debug(f'Error validating image {image_id}: {e}')
+         return 'warning', f"Could not validate '{image_id}' due to validation error"
+
+
+ def _parse_image_components(image_id: str) -> Tuple[str, str, str]:
+     """Parse image ID into registry, repository, and tag components."""
+     # Default to Docker Hub
+     if '/' not in image_id or '.' not in image_id.split('/')[0]:
+         registry = 'docker.io'
+         # For Docker Hub official images (single word), add 'library/' prefix
+         if ':' in image_id:
+             repo, tag = image_id.rsplit(':', 1)
+         else:
+             repo = image_id
+             tag = 'latest'
+         # Only add 'library/' prefix for single-word official images
+         if '/' not in repo:
+             repo = f'library/{repo}'
+     else:
+         parts = image_id.split('/')
+         if '.' in parts[0] or parts[0] in ['localhost']:
+             registry = parts[0]
+             repo = '/'.join(parts[1:])
+         else:
+             registry = 'docker.io'
+             repo = image_id
+
+         # Split repository and tag
+         if ':' in repo:
+             repo, tag = repo.rsplit(':', 1)
+         else:
+             tag = 'latest'
+
+     return registry, repo, tag
+
+
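To make the parsing rules concrete, a few inputs and the `(registry, repo, tag)` splits they should produce; the expected values are read off the logic above rather than taken from package tests:

```python
# Illustrative only: expected splits per the parsing rules above.
from konduktor.utils import validator

examples = [
    'ubuntu',                     # -> ('docker.io', 'library/ubuntu', 'latest')
    'pytorch/pytorch:2.3.0',      # -> ('docker.io', 'pytorch/pytorch', '2.3.0')
    'gcr.io/my-project/trainer',  # -> ('gcr.io', 'my-project/trainer', 'latest')
    'ghcr.io/org/app:v1.2',       # -> ('ghcr.io', 'org/app', 'v1.2')
]
for image in examples:
    print(image, validator._parse_image_components(image))
```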
+ def _validate_dockerhub_image(repo: str, tag: str) -> Tuple[str, str]:
+     """Validate image exists in Docker Hub using the official API."""
+     try:
+         # Use Docker Hub's official API v2 endpoint
+         # This endpoint checks if a specific tag exists for a repository
+         url = f'https://registry.hub.docker.com/v2/repositories/{repo}/tags/{tag}'
+
+         # Add User-Agent to avoid being blocked
+         headers = {'User-Agent': 'Konduktor-Docker-Validator/1.0'}
+
+         response = requests.get(url, headers=headers, timeout=10)
+
+         if response.status_code == 200:
+             return 'valid', f"Docker image '{repo}:{tag}' validated via Docker Hub"
+         else:
+             # API error, can't determine
+             return ('warning', f"Could not validate '{repo}:{tag}' in Docker Hub")
+
+     except requests.RequestException:
+         # Network error, can't determine
+         return (
+             'warning',
+             f"Could not validate '{repo}:{tag}' in Docker Hub " f'(network error)',
+         )
+
+
+ def _validate_gcr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
+     """Validate image exists in Google Container Registry."""
+     try:
+         # GCR manifest endpoint
+         url = f'https://{registry}/v2/{repo}/manifests/{tag}'
+         response = requests.get(url, timeout=10)
+
+         if response.status_code == 200:
+             return 'valid', f"Docker image '{repo}:{tag}' validated via {registry}"
+         else:
+             # API error, can't determine
+             return ('warning', f"Could not validate '{repo}:{tag}' in {registry} ")
+
+     except requests.RequestException:
+         # Network error, can't determine
+         return (
+             'warning',
+             f"Could not validate '{repo}:{tag}' in {registry} " f'(network error)',
+         )
+
+
+ def _validate_ecr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
+     """Validate image exists in Amazon ECR."""
+     # ECR requires AWS credentials and is complex to validate
+     # For now, return warning that we couldn't verify
+     return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
+
+
+ def _validate_nvcr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
+     """Validate image exists in NVIDIA Container Registry."""
+     # NVCR requires NVIDIA credentials and is complex to validate
+     # For now, return warning that we couldn't verify
+     return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
+
+
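The GCR helper above and the GHCR helper below share the standard registry v2 manifest probe; a standalone sketch of that pattern under the same assumptions (URL shape and 10-second timeout taken from the code above, the function name is mine):

```python
# Illustrative only: the bare "does this tag have a manifest?" probe.
from typing import Optional

import requests


def manifest_exists(registry: str, repo: str, tag: str,
                    token: Optional[str] = None) -> bool:
    """Return True only when the registry serves a manifest for repo:tag."""
    url = f'https://{registry}/v2/{repo}/manifests/{tag}'
    headers = {'Authorization': f'Bearer {token}'} if token else {}
    return requests.get(url, headers=headers, timeout=10).status_code == 200
```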
+ def _validate_ghcr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
+     """Validate image exists in GitHub Container Registry."""
+     try:
+         # Check if GITHUB_TOKEN is available
+         github_token = os.environ.get('GITHUB_TOKEN')
+
+         # If not in environment, try to get from konduktor secrets
+         if not github_token:
+             try:
+                 # these imports are inside the try block to avoid circular import error
+                 from konduktor.backends import constants as backend_constants
+                 from konduktor.utils import common_utils, kubernetes_utils
+
+                 context = kubernetes_utils.get_current_kube_config_context_name()
+                 namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+                 user_hash = common_utils.get_user_hash()
+                 label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
+                 user_secrets = kubernetes_utils.list_secrets(
+                     namespace, context, label_filter=label_selector
+                 )
+
+                 for secret in user_secrets:
+                     kind = kubernetes_utils.get_secret_kind(secret)
+                     if kind == 'env' and secret.data and 'GITHUB_TOKEN' in secret.data:
+                         # Decode the base64 encoded token
+                         github_token = base64.b64decode(
+                             secret.data['GITHUB_TOKEN']
+                         ).decode()
+                         logger.debug('GITHUB_TOKEN found in konduktor secret')
+                         break
+
+             except Exception as e:
+                 logger.debug(f'Failed to check konduktor secrets: {e}')
+
+         if not github_token:
+             return (
+                 'warning',
+                 'GITHUB_TOKEN unset, cannot verify this image. '
+                 'To enable validation, either:\n'
+                 ' 1. Set GITHUB_TOKEN locally: export GITHUB_TOKEN=<token>\n'
+                 ' 2. Create a secret: konduktor secret create --kind=env '
+                 '--inline GITHUB_TOKEN=<token> <name>\n'
+                 'See: https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry',
+             )
+
+         # Base64 encode the token
+         ghcr_token = base64.b64encode(github_token.encode()).decode()
+
+         # GHCR manifest endpoint
+         url = f'https://{registry}/v2/{repo}/manifests/{tag}'
+         headers = {'Authorization': f'Bearer {ghcr_token}'}
+         response = requests.get(url, headers=headers, timeout=10)
+
+         if response.status_code == 200:
+             return 'valid', f"Docker image '{repo}:{tag}' validated via {registry}"
+         else:
+             # API error, can't determine
+             return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
+
+     except requests.RequestException:
+         # Network error, can't determine
+         return (
+             'warning',
+             f"Could not validate '{repo}:{tag}' in {registry} " f'(network error)',
+         )
+
+
+ def _validate_quay_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
+     """Validate image exists in Quay.io Container Registry."""
+     # Quay.io requires authentication and is complex to validate
+     # For now, return warning that we couldn't verify
+     return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
+
+
+ # Track which images we've already warned about to avoid duplicate warnings
+ _warned_images = set()
+
+
+ def validate_and_warn_image(image_id: str, context: str = 'task') -> None:
+     """Validate Docker image and show appropriate warnings.
+
+     Args:
+         image_id: The Docker image ID to validate
+         context: Context for the validation (e.g., "task", "deployment")
+
+     """
+     if not image_id:
+         return
+
+     if _skip_image_checks():
+         logger.info(
+             'Skipping Docker image validation for %s',
+             image_id,
+         )
+         return
+
+     status, message = validate_docker_image(image_id)
+
+     if status == 'invalid':
+         # Invalid images should fail - they definitely don't exist
+         raise ValueError(
+             f'{message}\n'
+             f'This Docker image does not exist and will cause the {context} to fail.\n'
+             f"Please check that the image '{image_id}' is correct and accessible.\n"
+         )
+     elif status == 'warning':
+         # Only warn once per image per session for warnings
+         if image_id not in _warned_images:
+             _warned_images.add(image_id)
+
+             logger.warning(
+                 f'⚠️ Basic public image validation using Docker Daemon failed. ⚠️\n'
+                 f'⚠️ {message} ⚠️\n'
+                 f'⚠️ The {context} will be submitted anyway, but may be stuck '
+                 f'PENDING forever. ⚠️\n'
+                 f"⚠️ Check for 'ErrImagePull' or 'ImagePullBackOff' in "
+                 f'kubectl get pods if issues occur. ⚠️'
+             )
+
+             # Add info about private registries
+             logger.info(
+                 '⚠️ If pulling from a private registry, using ecr/nvcr, or not '
+                 'logged into Docker, this is safe to ignore. ⚠️'
+             )
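A final sketch of the entry point a caller would typically use; the image name is arbitrary, and the environment variable is the opt-out read by `_skip_image_checks` above:

```python
# Illustrative only: typical call site for the warn-or-fail entry point above.
from konduktor.utils import validator

# Exporting KONDUKTOR_SKIP_IMAGE_CHECK=1 before running disables the check
# entirely (see _skip_image_checks above).
try:
    validator.validate_and_warn_image('nvcr.io/nvidia/pytorch:24.05-py3',
                                      context='task')
except ValueError as e:
    # Raised only when the image is judged definitely invalid.
    print(f'refusing to submit: {e}')
```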
@@ -0,0 +1,91 @@
+ # Trainy Software License
+ ## Version 1.0
+
+ Last Updated: June 12, 2024
+ Copyright © 2024 Trainy, Inc.
+
+ This Trainy Software License (“License”), effective the date we first make the Software available to you (“Effective Date”) is between the individual or legal entity exercising permissions granted by this License (“you” or “your”) and Trainy, Inc. (“Trainy” or “we” or “us” or “our”) concerning your use of the work of authorship, whether in source or object form, made available under this License, as indicated by a copyright notice that is included in or attached to the work (the “Software”).
+
+ # Authority
+ If you use the Software on behalf of another person or legal entity, (a) all references to “you” throughout this License will include that person or legal entity, (b) you represent that you are authorized to accept this License on that person’s or legal entity’s behalf, and (c) in the event you or the person or legal entity violates this License, the person or legal entity agrees to be responsible to us.
+
+ # Licenses.
+
+ ## Copyright License.
+
+ Subject to your compliance with this License, Trainy hereby grants you a non-exclusive, royalty-free copyright license to reproduce, modify, create derivative works, publicly perform, publicly display and redistribute the Software for any purpose that is not a Restriction (as defined below).
+
+ ## Limited Patent License.
+ Subject to your compliance with this License, the license grant above includes a license under our Applicable Patents. Notwithstanding the foregoing, if you initiate any claim, action or proceeding against any third party (including cross-claims or counterclaims) alleging that all or any portion of the Software infringes the patents of any entity or other person, the license granted under this Section 2.b will immediately terminate automatically without further action by either party. “Applicable Patents” are any of patents licensable by us that will be necessarily infringed by using the Software.
+
+ ## Restrictions.
+ A “Restriction” means distributing or otherwise commercially making the Software available in a manner that provides functionality to address the same or substantially similar user requirements as the Software or includes the same or similar functionality as the Software. Without limiting the foregoing, you may use the Software for your internal purposes and for non-commercial purposes. We grant you no rights except as expressly set forth in this License.
+
+ ## Derivatives; Marking
+ If you distribute any copies, improvements, modifications or derivatives (“Derivatives”) of the Software, this License will apply, and you must include a copy of or a link to this License and include with such Derivatives any copyright, patent, trademark and attribution notices provided in or with the Software. You may reproduce our trademarks, trade names, service marks and other similar indicia (“Trademarks”) solely to the extent necessary to reproduce and/or link to applicable notices as provided in this Section 4. Except as provided in the prior sentence, we grant you no rights in or to our Trademarks, whether expressly, by implication, estoppel or otherwise.
+
+ ## Disclaimer.
+ THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, TITLE OR NON-INFRINGEMENT. IN NO EVENT WILL WE HAVE ANY LIABILITY TO YOU ARISING OUT OF OR RELATED TO THE SOFTWARE, INCLUDING INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, EVEN IF WE HAVE BEEN INFORMED OF THEIR POSSIBILITY IN ADVANCE.
+
+ ## Apache License.
+ Subject to your compliance with this License, Trainy hereby grants you a license to use the Software under the Apache License, Version 2.0, currently available at http://www.apache.org/licenses/LICENSE-2.0 (the “Apache License”), provided that you hereby covenant and agree not to use the Software under the Apache License until the third anniversary of the Effective Date. On or after such date, you may continue to use the Software under the terms of this License, or you may use the Software under the terms and conditions of, and in compliance with, the Apache License.
+
+ --------------------------------------------------------------------------------
+
+ The following files were modified from:
+
+ Code in manifests/kube-prometheus-stack.values modified from
+ https://github.com/prometheus-community/helm-charts
+ Git Revision: db15dcf70d06da9c71454f7d9757b47b04d2c5ae
+
+ Copyright 2024 Prometheus community Helm charts developers
+
+ Code in:
+ manifests/manifests.yaml
+ manifests/single-clusterqueue-setup.yaml
+ modified from
+ https://github.com/kubernetes-sigs/kueue/
+ Git Revision: bf050331ad8917afa3e3ddc92e3c8153b2a654d2
+
+ Copyright 2022 The Kubernetes Authors
+
+ Code in grafana/ modified from
+ https://github.com/NVIDIA/dcgm-exporter/
+ Git Revision: 478fab1573c7a1f0db4358357733967e4ec52200
+
+ Copyright 2022 NVIDIA
+
+ The original files are licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ The modifications are proprietary and subject to the terms of the Trainy Software License Version 1.0
+ Copyright 2024 Trainy Inc.
+
+ Code is modified from https://github.com/skypilot-org/skypilot
+
+ The original files are licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ The modifications are proprietary and subject to the terms of the Trainy Software License Version 1.0
+ Copyright 2024 Trainy Inc.
@@ -0,0 +1,98 @@
+ Metadata-Version: 2.3
+ Name: konduktor-nightly
+ Version: 0.1.0.dev20251128104812
+ Summary: GPU Cluster Health Management
+ Author: Andrew Aikawa
+ Author-email: asai@berkeley.edu
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Provides-Extra: s3
+ Requires-Dist: awscli[s3] (>=1.32.84,<2.0.0) ; extra == "s3"
+ Requires-Dist: boto3[s3] (>=1.34.84,<2.0.0) ; extra == "s3"
+ Requires-Dist: botocore[s3] (>=1.34.84,<2.0.0) ; extra == "s3"
+ Requires-Dist: click (>=8.1.7,<9.0.0)
+ Requires-Dist: colorama (>=0.4.6,<0.5.0)
+ Requires-Dist: filelock (>=3.18.0,<4.0.0)
+ Requires-Dist: google-api-python-client[gcp] (>=2.161.0,<3.0.0)
+ Requires-Dist: google-cloud-storage[gcp] (>=3.0.0,<4.0.0)
+ Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
+ Requires-Dist: jsonschema (>=4.23.0,<5.0.0)
+ Requires-Dist: kr8s (>=0.20.1,<0.21.0)
+ Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
+ Requires-Dist: posthog (>=3.7.4,<4.0.0)
+ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
+ Requires-Dist: psutil (>=7.0.0,<8.0.0)
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
+ Requires-Dist: rich (>=13.9.4,<14.0.0)
+ Requires-Dist: websockets (>=15.0.1,<16.0.0)
+ Description-Content-Type: text/markdown
+
+
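The `s3` extra above gates the AWS tooling (awscli, boto3, botocore); assuming standard pip extras syntax, installing the nightly wheel with S3 support would look like `pip install "konduktor-nightly[s3]"`, while a plain `pip install konduktor-nightly` pulls in only the unconditional requirements (the GCP and Kubernetes client libraries among them).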
+ <p align="center">
+ <picture>
+ <img alt="Trainy Konduktor Logo" src="https://raw.githubusercontent.com/Trainy-ai/konduktor/main/docs/source/images/konduktor-logo-white-no-background.png" width="353" height="64" style="max-width: 100%;">
+ </picture>
+ <br/>
+ <br/>
+ </p>
+
+ Built on [Kubernetes](https://kubernetes.io). Konduktor uses existing open source tools to build a platform that makes it easy for ML researchers to submit batch jobs and for administrative/infra teams to easily manage GPU clusters.
+
+ ## How it works
+
+ Konduktor uses a combination of open source projects. Where tools exist under MIT, Apache, or another compatible open license, we want to use and even contribute to that tool. Where we see gaps in tooling, we build it.
+
+ ### Architecture
+
+ Konduktor can be self-hosted and run on any certified Kubernetes distribution, or managed by us. Contact us at founders@trainy.ai if you are just interested in the managed version. We're focused on tooling for clusters with NVIDIA cards for now, but in the future we may expand our scope to support other accelerators.
+
+ <p align="center">
+ <img alt="architecture" src="https://raw.githubusercontent.com/Trainy-ai/konduktor/main/docs/source/images/architecture.png" width=80%>
+ </p>
+
+ For ML researchers
+ - Konduktor CLI & SDK - a user-friendly batch job framework where users only need to specify the resource requirements of their job and a script to launch, making it simple to scale work across multiple nodes. Works with most ML application frameworks out of the box.
+
+ ```
+ num_nodes: 100
+
+ resources:
+   accelerators: H100:8
+   cloud: kubernetes
+   labels:
+     kueue.x-k8s.io/queue-name: user-queue
+     kueue.x-k8s.io/priority-class: low-priority
+
+ run: |
+   torchrun \
+     --nproc_per_node 8 \
+     --rdzv_id=1 --rdzv_endpoint=$master_addr:1234 \
+     --rdzv_backend=c10d --nnodes $num_nodes \
+     torch_ddp_benchmark.py --distributed-backend nccl
+ ```
+
+ For cluster administrators
+ - [DCGM Exporter](https://github.com/NVIDIA/dcgm-exporter), [GPU operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/), [Network Operator](https://github.com/Mellanox/network-operator) - For installing the NVIDIA driver and container runtime, and exporting node health metrics.
+ - [Kueue](https://kueue.sigs.k8s.io/docs/) - Centralized creation of job queues, gang scheduling, and resource quotas and sharing across projects.
+ - [Prometheus](https://prometheus.io/) - For publishing metrics about node health and workload queues.
+ - [OpenTelemetry](https://opentelemetry.io/) - For pushing logs from each node.
+ - [Grafana, Loki](https://grafana.com/) - Visualization for the metrics and logging stack.
+
+
+ ## Community & Support
+ - [Discord](https://discord.com/invite/HQUBJSVgAP)
+ - founders@trainy.ai
+
+ ## Contributor Guide
+
+ Format your code with
+ ```
+ poetry install --with dev
+ bash format.sh
+ ```
+