konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
"""This module contains a custom validator for the JSON Schema specification.
|
|
2
|
+
|
|
3
|
+
The main motivation behind extending the existing JSON Schema validator is to
|
|
4
|
+
allow for case-insensitive enum matching since this is currently not supported
|
|
5
|
+
by the JSON Schema specification.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import base64
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import subprocess
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Tuple
|
|
16
|
+
|
|
17
|
+
import jsonschema
|
|
18
|
+
import requests
|
|
19
|
+
from colorama import Fore, Style
|
|
20
|
+
from filelock import FileLock
|
|
21
|
+
|
|
22
|
+
from konduktor import logging
|
|
23
|
+
|
|
24
|
+
SCHEMA_VERSION = 'v1.32.0-standalone-strict'
|
|
25
|
+
SCHEMA_CACHE_PATH = Path.home() / '.konduktor/schemas'
|
|
26
|
+
SCHEMA_LOCK_PATH = SCHEMA_CACHE_PATH / '.lock'
|
|
27
|
+
CACHE_MAX_AGE_SECONDS = 86400 # 24 hours
|
|
28
|
+
|
|
29
|
+
# Schema URLs for different Kubernetes resources
|
|
30
|
+
SCHEMA_URLS = {
|
|
31
|
+
'podspec': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/podspec.json',
|
|
32
|
+
'deployment': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/deployment.json',
|
|
33
|
+
'service': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/service.json',
|
|
34
|
+
'horizontalpodautoscaler': f'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{SCHEMA_VERSION}/horizontalpodautoscaler-autoscaling-v2.json',
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
logger = logging.get_logger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _skip_image_checks() -> bool:
|
|
41
|
+
val = os.getenv('KONDUKTOR_SKIP_IMAGE_CHECK', '')
|
|
42
|
+
return val.lower() in ('1', 'true', 'yes', 'y')
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def case_insensitive_enum(validator, enums, instance, schema):
    """Custom 'case_insensitive_enum' keyword: match `instance` against
    `enums` ignoring case.

    Yields a jsonschema.ValidationError when the instance does not match any
    enum entry. Robustness fix: a non-string instance now yields a proper
    validation error instead of crashing with AttributeError on `.lower()`
    (jsonschema keyword validators are expected to yield errors, not raise).
    """
    del validator, schema  # Unused.
    if not isinstance(instance, str):
        yield jsonschema.ValidationError(f'{instance!r} is not one of {enums!r}')
        return
    if instance.lower() not in [enum.lower() for enum in enums]:
        yield jsonschema.ValidationError(f'{instance!r} is not one of {enums!r}')
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Draft-7 validator extended with the custom 'case_insensitive_enum' keyword
# defined above, so schemas can accept enum values regardless of case (the
# JSON Schema spec's own 'enum' keyword is case-sensitive).
SchemaValidator = jsonschema.validators.extend(
    jsonschema.Draft7Validator,
    validators={'case_insensitive_enum': case_insensitive_enum},
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_cached_schema(schema_type: str) -> dict:
    """Return the JSON schema for a Kubernetes resource type, caching it on
    disk.

    Schemas are downloaded from SCHEMA_URLS and cached under
    SCHEMA_CACHE_PATH for CACHE_MAX_AGE_SECONDS. A file lock serializes
    cache reads/refreshes across concurrent konduktor processes.

    Args:
        schema_type: One of the keys of SCHEMA_URLS (e.g. 'podspec').

    Returns:
        The parsed JSON schema as a dict.

    Raises:
        ValueError: If schema_type is not a known schema.
        requests.HTTPError: If the schema download fails.
    """
    schema_url = SCHEMA_URLS.get(schema_type)
    if not schema_url:
        raise ValueError(f'Unknown schema type: {schema_type}')

    schema_file = SCHEMA_CACHE_PATH / f'{schema_type}.json'

    # BUG FIX: create the cache directory *before* constructing the FileLock.
    # SCHEMA_LOCK_PATH lives inside SCHEMA_CACHE_PATH, so on a fresh machine
    # the lock file could not be created and the first call failed.
    SCHEMA_CACHE_PATH.mkdir(parents=True, exist_ok=True)
    lock = FileLock(str(SCHEMA_LOCK_PATH))

    with lock:
        # Serve from cache if the file exists and is still fresh.
        if schema_file.exists():
            age = time.time() - schema_file.stat().st_mtime
            if age < CACHE_MAX_AGE_SECONDS:
                with open(schema_file, 'r') as f:
                    return json.load(f)

        # Download the schema. The timeout prevents hanging indefinitely on
        # an unreachable host (consistent with the other HTTP calls in this
        # module, which all use timeout=10).
        resp = requests.get(schema_url, timeout=10)
        resp.raise_for_status()

        with open(schema_file, 'w') as f:
            f.write(resp.text)

        return resp.json()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _validate_k8s_spec(spec: dict, schema_type: str, resource_name: str) -> None:
    """Generic validation function for Kubernetes specs.

    Args:
        spec: The spec/config dict to validate.
        schema_type: Key into SCHEMA_URLS identifying the schema to use.
        resource_name: Human-readable resource name used in messages.

    Raises:
        ValueError: If validation fails; the message lists every schema error
            (colorized for CLI display).
    """
    schema = get_cached_schema(schema_type)

    # NOTE(review): uses the stock Draft7Validator rather than the extended
    # SchemaValidator defined above — confirm case-insensitive enums are not
    # needed for these k8s schemas.
    validator = jsonschema.Draft7Validator(schema)
    errors = sorted(validator.iter_errors(spec), key=lambda e: e.path)

    if not errors:
        return

    # Build the plain messages once and reuse them for both the debug log and
    # the colorized exception (the original duplicated this formatting).
    formatted = [
        f'- {error.message}'
        + (f" at path: {' → '.join(str(p) for p in error.path)}" if error.path else '')
        for error in errors
    ]

    # Clean (uncolored) log for debugging.
    logger.debug('Invalid k8s %s spec/config:\n%s', resource_name, '\n'.join(formatted))

    # Color only in CLI-facing error output.
    formatted_colored = [f'{Fore.RED}{line}{Style.RESET_ALL}' for line in formatted]

    raise ValueError(
        f'\n{Fore.RED}Invalid k8s {resource_name} spec/config: {Style.RESET_ALL}\n'
        + '\n'.join(formatted_colored)
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def validate_pod_spec(pod_spec: dict) -> None:
    """Validate a Kubernetes pod spec.

    Raises:
        ValueError: If the spec fails JSON-schema validation.
    """
    _validate_k8s_spec(pod_spec, 'podspec', 'pod')
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def validate_deployment_spec(deployment_spec: dict) -> None:
    """Validate a Kubernetes deployment spec.

    Raises:
        ValueError: If the spec fails JSON-schema validation.
    """
    _validate_k8s_spec(deployment_spec, 'deployment', 'deployment')
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def validate_service_spec(service_spec: dict) -> None:
    """Validate a Kubernetes service spec.

    Raises:
        ValueError: If the spec fails JSON-schema validation.
    """
    _validate_k8s_spec(service_spec, 'service', 'service')
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def validate_horizontalpodautoscaler_spec(hpa_spec: dict) -> None:
    """Validate a Kubernetes HorizontalPodAutoscaler (autoscaling/v2) spec.

    Raises:
        ValueError: If the spec fails JSON-schema validation.
    """
    _validate_k8s_spec(hpa_spec, 'horizontalpodautoscaler', 'horizontalpodautoscaler')
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def validate_docker_image(image_id: str) -> Tuple[str, str]:
    """Validate if a Docker image exists and is accessible.

    Args:
        image_id: The Docker image ID to validate
            (e.g., 'ubuntu:latest', 'gcr.io/project/image:tag')

    Returns:
        Tuple of (status, message) where status is:
        - 'valid': Image definitely exists
        - 'warning': Couldn't validate, but might be valid
        - 'invalid': Image definitely doesn't exist
    """
    # Guard: must be a non-empty string before anything else.
    if not isinstance(image_id, str) or not image_id:
        return 'invalid', 'Image ID must be a non-empty string'

    # Reject obviously malformed image references up front.
    if not _is_valid_docker_image_format(image_id):
        return 'invalid', f'Invalid Docker image format: {image_id}'

    # Prefer registry-API validation: it works without a Docker daemon.
    status, message = _validate_image_in_registry(image_id)
    if status != 'warning':
        # Definitive answer ('valid' or 'invalid') — done.
        return status, message

    # Inconclusive: fall back to a local Docker manifest inspection.
    if _can_pull_image_locally(image_id):
        return 'valid', f"Docker image '{image_id}' validated locally"

    # Still inconclusive — surface the registry warning.
    return status, message
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _is_valid_docker_image_format(image_id: str) -> bool:
|
|
173
|
+
"""Check if the image ID follows valid Docker image naming conventions."""
|
|
174
|
+
# Basic regex for Docker image names
|
|
175
|
+
# Supports: name:tag, registry/name:tag, registry/namespace/name:tag
|
|
176
|
+
pattern = (
|
|
177
|
+
r'^[a-zA-Z0-9][a-zA-Z0-9._-]*'
|
|
178
|
+
r'(?:\/[a-zA-Z0-9][a-zA-Z0-9._-]*)*'
|
|
179
|
+
r'(?::[a-zA-Z0-9._-]+)?$'
|
|
180
|
+
)
|
|
181
|
+
return bool(re.match(pattern, image_id))
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _can_pull_image_locally(image_id: str) -> bool:
    """Check whether the image exists by querying its manifest via the local
    Docker CLI (no actual pull)."""
    cmd = ['docker', 'manifest', 'inspect', image_id]
    try:
        # 'manifest inspect' is far cheaper than 'docker pull' for a pure
        # existence check.
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=30,  # 30 second timeout
        )
    except (
        subprocess.TimeoutExpired,
        FileNotFoundError,
        subprocess.SubprocessError,
    ) as e:
        # Docker isn't installed/usable, or the command timed out.
        logger.debug(f'Local Docker manifest inspect failed for {image_id}: {e}')
        return False

    # Debug logging of the raw CLI result.
    logger.debug(
        f'Local Docker manifest inspect for {image_id}: '
        f'returncode={proc.returncode}, '
        f"stdout='{proc.stdout}', "
        f"stderr='{proc.stderr}'"
    )
    return proc.returncode == 0
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _validate_image_in_registry(image_id: str) -> Tuple[str, str]:
    """Validate image exists in registry using API calls.

    Dispatches to a registry-specific validator based on the registry host
    parsed from the image reference.

    Returns:
        (status, message) where status is 'valid', 'invalid', or 'warning'
        (could not determine).
    """
    try:
        registry, repo, tag = _parse_image_components(image_id)

        if registry == 'docker.io':
            return _validate_dockerhub_image(repo, tag)
        elif registry.endswith('gcr.io'):
            return _validate_gcr_image(registry, repo, tag)
        elif '.ecr.' in registry and registry.endswith('.amazonaws.com'):
            # BUG FIX: the original condition `registry.endswith('ecr.')`
            # could never be true for real ECR hosts like
            # <acct>.dkr.ecr.<region>.amazonaws.com (they end in
            # '.amazonaws.com'), so ECR images always fell through to the
            # generic "unsupported registry" branch below.
            return _validate_ecr_image(registry, repo, tag)
        elif registry == 'nvcr.io':
            return _validate_nvcr_image(registry, repo, tag)
        elif registry == 'ghcr.io':
            return _validate_ghcr_image(registry, repo, tag)
        elif registry == 'quay.io':
            return _validate_quay_image(registry, repo, tag)
        else:
            # For other registries, we can't easily validate without
            # credentials. Return warning that we couldn't verify.
            return (
                'warning',
                f"Could not validate '{image_id}' in registry {registry} "
                f'(not supported)',
            )

    except Exception as e:
        logger.debug(f'Error validating image {image_id}: {e}')
        return 'warning', f"Could not validate '{image_id}' due to validation error"
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _parse_image_components(image_id: str) -> Tuple[str, str, str]:
|
|
246
|
+
"""Parse image ID into registry, repository, and tag components."""
|
|
247
|
+
# Default to Docker Hub
|
|
248
|
+
if '/' not in image_id or '.' not in image_id.split('/')[0]:
|
|
249
|
+
registry = 'docker.io'
|
|
250
|
+
# For Docker Hub official images (single word), add 'library/' prefix
|
|
251
|
+
if ':' in image_id:
|
|
252
|
+
repo, tag = image_id.rsplit(':', 1)
|
|
253
|
+
else:
|
|
254
|
+
repo = image_id
|
|
255
|
+
tag = 'latest'
|
|
256
|
+
# Only add 'library/' prefix for single-word official images
|
|
257
|
+
if '/' not in repo:
|
|
258
|
+
repo = f'library/{repo}'
|
|
259
|
+
else:
|
|
260
|
+
parts = image_id.split('/')
|
|
261
|
+
if '.' in parts[0] or parts[0] in ['localhost']:
|
|
262
|
+
registry = parts[0]
|
|
263
|
+
repo = '/'.join(parts[1:])
|
|
264
|
+
else:
|
|
265
|
+
registry = 'docker.io'
|
|
266
|
+
repo = image_id
|
|
267
|
+
|
|
268
|
+
# Split repository and tag
|
|
269
|
+
if ':' in repo:
|
|
270
|
+
repo, tag = repo.rsplit(':', 1)
|
|
271
|
+
else:
|
|
272
|
+
tag = 'latest'
|
|
273
|
+
|
|
274
|
+
return registry, repo, tag
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _validate_dockerhub_image(repo: str, tag: str) -> Tuple[str, str]:
    """Validate image exists in Docker Hub using the official API."""
    # Docker Hub v2 API: a 200 means this exact tag exists for the repo.
    url = f'https://registry.hub.docker.com/v2/repositories/{repo}/tags/{tag}'
    # A User-Agent avoids being blocked by the API.
    headers = {'User-Agent': 'Konduktor-Docker-Validator/1.0'}

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Network error — can't determine either way.
        return (
            'warning',
            f"Could not validate '{repo}:{tag}' in Docker Hub (network error)",
        )

    if response.status_code == 200:
        return 'valid', f"Docker image '{repo}:{tag}' validated via Docker Hub"
    # Non-200 API response — can't determine.
    return ('warning', f"Could not validate '{repo}:{tag}' in Docker Hub")
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _validate_gcr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
    """Validate image exists in Google Container Registry."""
    # GCR exposes the standard registry v2 manifest endpoint; 200 == exists.
    url = f'https://{registry}/v2/{repo}/manifests/{tag}'

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Network error — can't determine either way.
        return (
            'warning',
            f"Could not validate '{repo}:{tag}' in {registry} (network error)",
        )

    if response.status_code == 200:
        return 'valid', f"Docker image '{repo}:{tag}' validated via {registry}"
    # Non-200 API response — can't determine. (Trailing space preserved from
    # the original message text.)
    return ('warning', f"Could not validate '{repo}:{tag}' in {registry} ")
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _validate_ecr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
|
|
325
|
+
"""Validate image exists in Amazon ECR."""
|
|
326
|
+
# ECR requires AWS credentials and is complex to validate
|
|
327
|
+
# For now, return warning that we couldn't verify
|
|
328
|
+
return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _validate_nvcr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
|
|
332
|
+
"""Validate image exists in NVIDIA Container Registry."""
|
|
333
|
+
# NVCR requires NVIDIA credentials and is complex to validate
|
|
334
|
+
# For now, return warning that we couldn't verify
|
|
335
|
+
return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _validate_ghcr_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
    """Validate image exists in GitHub Container Registry.

    Looks for a GITHUB_TOKEN first in the environment, then in the user's
    konduktor 'env'-kind Kubernetes secrets; without a token the image cannot
    be verified and a 'warning' status is returned with setup instructions.

    Args:
        registry: Registry host (always 'ghcr.io' for this path).
        repo: Repository path within the registry.
        tag: Image tag to check.

    Returns:
        (status, message): 'valid' when the manifest endpoint returns 200,
        otherwise 'warning' (missing token, non-200 response, or network
        error).
    """
    try:
        # Check if GITHUB_TOKEN is available
        github_token = os.environ.get('GITHUB_TOKEN')

        # If not in environment, try to get from konduktor secrets
        if not github_token:
            try:
                # these imports are inside the try block to avoid circular import error
                from konduktor.backends import constants as backend_constants
                from konduktor.utils import common_utils, kubernetes_utils

                # Look only at secrets owned by the current user (matched via
                # the owner label) in the active kubeconfig context/namespace.
                context = kubernetes_utils.get_current_kube_config_context_name()
                namespace = kubernetes_utils.get_kube_config_context_namespace(context)
                user_hash = common_utils.get_user_hash()
                label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
                user_secrets = kubernetes_utils.list_secrets(
                    namespace, context, label_filter=label_selector
                )

                for secret in user_secrets:
                    kind = kubernetes_utils.get_secret_kind(secret)
                    if kind == 'env' and secret.data and 'GITHUB_TOKEN' in secret.data:
                        # Decode the base64 encoded token (k8s stores secret
                        # data base64-encoded).
                        github_token = base64.b64decode(
                            secret.data['GITHUB_TOKEN']
                        ).decode()
                        logger.debug('GITHUB_TOKEN found in konduktor secret')
                        break

            except Exception as e:
                # Best-effort lookup: any failure here just means we proceed
                # without a token and return the 'warning' below.
                logger.debug(f'Failed to check konduktor secrets: {e}')

        if not github_token:
            return (
                'warning',
                'GITHUB_TOKEN unset, cannot verify this image. '
                'To enable validation, either:\n'
                ' 1. Set GITHUB_TOKEN locally: export GITHUB_TOKEN=<token>\n'
                ' 2. Create a secret: konduktor secret create --kind=env '
                '--inline GITHUB_TOKEN=<token> <name>\n'
                'See: https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry',
            )

        # Base64 encode the token — GHCR accepts a base64-encoded PAT as a
        # Bearer token for its registry API (per GitHub's container-registry
        # docs).
        ghcr_token = base64.b64encode(github_token.encode()).decode()

        # GHCR manifest endpoint; 200 means the tag exists and is accessible
        # with this token.
        url = f'https://{registry}/v2/{repo}/manifests/{tag}'
        headers = {'Authorization': f'Bearer {ghcr_token}'}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            return 'valid', f"Docker image '{repo}:{tag}' validated via {registry}"
        else:
            # API error, can't determine
            return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")

    except requests.RequestException:
        # Network error, can't determine
        return (
            'warning',
            f"Could not validate '{repo}:{tag}' in {registry} " f'(network error)',
        )
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _validate_quay_image(registry: str, repo: str, tag: str) -> Tuple[str, str]:
|
|
406
|
+
"""Validate image exists in Quay.io Container Registry."""
|
|
407
|
+
# Quay.io requires authentication and is complex to validate
|
|
408
|
+
# For now, return warning that we couldn't verify
|
|
409
|
+
return ('warning', f"Could not validate '{repo}:{tag}' in {registry}")
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
# Track which images we've already warned about to avoid duplicate warnings
# within a single process/session (consumed by validate_and_warn_image).
_warned_images = set()
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def validate_and_warn_image(image_id: str, context: str = 'task') -> None:
    """Validate Docker image and show appropriate warnings.

    Args:
        image_id: The Docker image ID to validate
        context: Context for the validation (e.g., "task", "deployment")

    Raises:
        ValueError: If the image is definitively invalid.
    """
    # Nothing to validate.
    if not image_id:
        return

    # Honor the KONDUKTOR_SKIP_IMAGE_CHECK opt-out.
    if _skip_image_checks():
        logger.info(
            'Skipping Docker image validation for %s',
            image_id,
        )
        return

    status, message = validate_docker_image(image_id)

    if status == 'invalid':
        # A definitively missing image will break the submission — fail fast.
        raise ValueError(
            f'{message}\n'
            f'This Docker image does not exist and will cause the {context} to fail.\n'
            f"Please check that the image '{image_id}' is correct and accessible.\n"
        )

    if status != 'warning':
        # 'valid' — nothing to report.
        return

    # Warn at most once per image per session.
    if image_id in _warned_images:
        return
    _warned_images.add(image_id)

    logger.warning(
        f'⚠️ Basic public image validation using Docker Daemon failed. ⚠️\n'
        f'⚠️ {message} ⚠️\n'
        f'⚠️ The {context} will be submitted anyway, but may be stuck '
        f'PENDING forever. ⚠️\n'
        f"⚠️ Check for 'ErrImagePull' or 'ImagePullBackOff' in "
        f'kubectl get pods if issues occur. ⚠️'
    )

    # Add info about private registries
    logger.info(
        '⚠️ If pulling from a private registry, using ecr/nvcr, or not '
        'logged into Docker, this is safe to ignore. ⚠️'
    )
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Trainy Software License
|
|
2
|
+
## Version 1.0
|
|
3
|
+
|
|
4
|
+
Last Updated: June 12, 2024
|
|
5
|
+
Copyright © 2024 Trainy, Inc.
|
|
6
|
+
|
|
7
|
+
This Trainy Software License (“License”), effective the date we first make the Software available to you (“Effective Date”) is between the individual or legal entity exercising permissions granted by this License (“you” or “your”) and Trainy, Inc. (“Trainy” or “we” or “us” or “our”) concerning your use of the work of authorship, whether in source or object form, made available under this License, as indicated by a copyright notice that is included in or attached to the work (the “Software”).
|
|
8
|
+
|
|
9
|
+
# Authority
|
|
10
|
+
If you use the Software on behalf of another person or legal entity, (a) all references to “you” throughout this License will include that person or legal entity, (b) you represent that you are authorized to accept this License on that person’s or legal entity’s behalf, and (c) in the event you or the person or legal entity violates this License, the person or legal entity agrees to be responsible to us.
|
|
11
|
+
|
|
12
|
+
# Licenses.
|
|
13
|
+
|
|
14
|
+
## Copyright License.
|
|
15
|
+
|
|
16
|
+
Subject to your compliance with this License, Trainy hereby grants you a non-exclusive, royalty-free copyright license to reproduce, modify, create derivative works, publicly perform, publicly display and redistribute the Software for any purpose that is not a Restriction (as defined below).
|
|
17
|
+
|
|
18
|
+
## Limited Patent License.
|
|
19
|
+
Subject to your compliance with this License, the license grant above includes a license under our Applicable Patents. Notwithstanding the foregoing, If you initiate any claim, action or proceeding against any third party (including cross-claims or counterclaims) alleging that all or any portion of the Software infringes the patents of any entity or other person, the license granted under this Section 2.b will immediately terminate automatically without further action by either party. “Applicable Patents” are any of patents licensable by us that will be necessarily infringed by using the Software.
|
|
20
|
+
|
|
21
|
+
## Restrictions.
|
|
22
|
+
A “Restriction” means distributing or otherwise commercially making the Software available in a manner that provides functionality to address the same or substantially similar user requirements as the Software or includes the same or similar functionality as the Software. Without limiting the foregoing, you may use the Software for your internal purposes and for non-commercial purposes. We grant you no rights except as expressly set forth in this License.
|
|
23
|
+
|
|
24
|
+
## Derivatives; Marking
|
|
25
|
+
If you distribute any copies, improvements, modifications or derivatives (“Derivatives”) of the Software, this License will apply to such Derivatives, and you must include a copy of or a link to this License and include with such Derivatives any copyright, patent, trademark and attribution notices provided in or with the Software. You may reproduce our trademarks, trade names, service marks and other similar indicia (“Trademarks”) solely to the extent necessary to reproduce and/or link to applicable notices as provided in this Section 4. Except as provided in the prior sentence, we grant you no rights in or to our Trademarks, whether expressly, by implication, estoppel or otherwise.
|
|
26
|
+
|
|
27
|
+
## Disclaimer.
|
|
28
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, TITLE OR NON-INFRINGEMENT. IN NO EVENT WILL WE HAVE ANY LIABILITY TO YOU ARISING OUT OF OR RELATED TO THE SOFTWARE, INCLUDING INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, EVEN IF WE HAVE BEEN INFORMED OF THEIR POSSIBILITY IN ADVANCE.
|
|
29
|
+
|
|
30
|
+
## Apache License.
|
|
31
|
+
Subject to your compliance with this License, Trainy hereby grants you a license to use the Software under the Apache License, Version 2.0, currently available at http://www.apache.org/licenses/LICENSE-2.0 (the “Apache License”), provided that you hereby covenant and agree not to use the Software under the Apache License until the third anniversary of the Effective Date. On or after such date, you may continue to use the Software under the terms of this License, or you may use the Software under the terms and conditions of, and in compliance with, the Apache License.
|
|
32
|
+
|
|
33
|
+
--------------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
The following files were modified from:
|
|
36
|
+
|
|
37
|
+
Code in manifests/kube-prometheus-stack.values modified from
|
|
38
|
+
https://github.com/prometheus-community/helm-charts
|
|
39
|
+
Git Revision: db15dcf70d06da9c71454f7d9757b47b04d2c5ae
|
|
40
|
+
|
|
41
|
+
Copyright 2024 Prometheus community Helm charts developers
|
|
42
|
+
|
|
43
|
+
Code in manifests/kube-prometheus-stack.values modified from
|
|
44
|
+
https://github.com/prometheus-community/helm-charts
|
|
45
|
+
Git Revision: db15dcf70d06da9c71454f7d9757b47b04d2c5ae
|
|
46
|
+
|
|
47
|
+
Code in:
|
|
48
|
+
manifests/manifests.yaml
|
|
49
|
+
manifests/single-clusterqueue-setup.yaml
|
|
50
|
+
modified from
|
|
51
|
+
https://github.com/kubernetes-sigs/kueue/
|
|
52
|
+
Git Revision: bf050331ad8917afa3e3ddc92e3c8153b2a654d2
|
|
53
|
+
|
|
54
|
+
Copyright 2022 The Kubernetes Authors
|
|
55
|
+
|
|
56
|
+
Code in grafana/ modified from
|
|
57
|
+
https://github.com/NVIDIA/dcgm-exporter/
|
|
58
|
+
Git Revision: 478fab1573c7a1f0db4358357733967e4ec52200
|
|
59
|
+
|
|
60
|
+
Copyright 2022 NVIDIA
|
|
61
|
+
|
|
62
|
+
The original files are licensed under the Apache License, Version 2.0 (the "License");
|
|
63
|
+
you may not use this file except in compliance with the License.
|
|
64
|
+
You may obtain a copy of the License at
|
|
65
|
+
|
|
66
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
67
|
+
|
|
68
|
+
Unless required by applicable law or agreed to in writing, software
|
|
69
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
70
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
71
|
+
See the License for the specific language governing permissions and
|
|
72
|
+
limitations under the License.
|
|
73
|
+
|
|
74
|
+
The modifications are proprietary and subject to the terms of the Trainy Software License Version 1.0
|
|
75
|
+
Copyright 2024 Trainy Inc.
|
|
76
|
+
|
|
77
|
+
Code is modified from https://github.com/skypilot-org/skypilot
|
|
78
|
+
|
|
79
|
+
The original files are licensed under the Apache License, Version 2.0 (the "License");
|
|
80
|
+
you may not use this file except in compliance with the License.
|
|
81
|
+
You may obtain a copy of the License at
|
|
82
|
+
|
|
83
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
84
|
+
|
|
85
|
+
Unless required by applicable law or agreed to in writing, software
|
|
86
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
87
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
88
|
+
See the License for the specific language governing permissions and
|
|
89
|
+
limitations under the License.
|
|
90
|
+
The modifications are proprietary and subject to the terms of the Trainy Software License Version 1.0
|
|
91
|
+
Copyright 2024 Trainy Inc.
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: konduktor-nightly
|
|
3
|
+
Version: 0.1.0.dev20251128104812
|
|
4
|
+
Summary: GPU Cluster Health Management
|
|
5
|
+
Author: Andrew Aikawa
|
|
6
|
+
Author-email: asai@berkeley.edu
|
|
7
|
+
Requires-Python: >=3.9,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Provides-Extra: s3
|
|
15
|
+
Requires-Dist: awscli[s3] (>=1.32.84,<2.0.0) ; extra == "s3"
|
|
16
|
+
Requires-Dist: boto3[s3] (>=1.34.84,<2.0.0) ; extra == "s3"
|
|
17
|
+
Requires-Dist: botocore[s3] (>=1.34.84,<2.0.0) ; extra == "s3"
|
|
18
|
+
Requires-Dist: click (>=8.1.7,<9.0.0)
|
|
19
|
+
Requires-Dist: colorama (>=0.4.6,<0.5.0)
|
|
20
|
+
Requires-Dist: filelock (>=3.18.0,<4.0.0)
|
|
21
|
+
Requires-Dist: google-api-python-client[gcp] (>=2.161.0,<3.0.0)
|
|
22
|
+
Requires-Dist: google-cloud-storage[gcp] (>=3.0.0,<4.0.0)
|
|
23
|
+
Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
|
|
24
|
+
Requires-Dist: jsonschema (>=4.23.0,<5.0.0)
|
|
25
|
+
Requires-Dist: kr8s (>=0.20.1,<0.21.0)
|
|
26
|
+
Requires-Dist: kubernetes (>=30.1.0,<31.0.0)
|
|
27
|
+
Requires-Dist: posthog (>=3.7.4,<4.0.0)
|
|
28
|
+
Requires-Dist: prettytable (>=3.12.0,<4.0.0)
|
|
29
|
+
Requires-Dist: psutil (>=7.0.0,<8.0.0)
|
|
30
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
31
|
+
Requires-Dist: rich (>=13.9.4,<14.0.0)
|
|
32
|
+
Requires-Dist: websockets (>=15.0.1,<16.0.0)
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<picture>
|
|
38
|
+
<img alt="Trainy Konduktor Logo" src="https://raw.githubusercontent.com/Trainy-ai/konduktor/main/docs/source/images/konduktor-logo-white-no-background.png" width="353" height="64" style="max-width: 100%;">
|
|
39
|
+
</picture>
|
|
40
|
+
<br/>
|
|
41
|
+
<br/>
|
|
42
|
+
</p>
|
|
43
|
+
|
|
44
|
+
Built on [Kubernetes](https://kubernetes.io). Konduktor uses existing open source tools to build a platform that makes it easy for ML Researchers to submit batch jobs and for administrative/infra teams to easily manage GPU clusters.
|
|
45
|
+
|
|
46
|
+
## How it works
|
|
47
|
+
|
|
48
|
+
Konduktor uses a combination of open source projects. Where tools exist with MIT, Apache, or another compatible open license, we want to use and even contribute to that tool. Where we see gaps in tooling, we build it.
|
|
49
|
+
|
|
50
|
+
### Architecture
|
|
51
|
+
|
|
52
|
+
Konduktor can be self-hosted and run on any certified Kubernetes distribution or managed by us. Contact us at founders@trainy.ai if you are just interested in the managed version. We're focused on tooling for clusters with NVIDIA cards for now but in the future we may expand our scope to support other accelerators.
|
|
53
|
+
|
|
54
|
+
<p align="center">
|
|
55
|
+
<img alt="architecture" src="https://raw.githubusercontent.com/Trainy-ai/konduktor/main/docs/source/images/architecture.png" width=80%>
|
|
56
|
+
</p>
|
|
57
|
+
|
|
58
|
+
For ML researchers
|
|
59
|
+
- Konduktor CLI & SDK - user friendly batch job framework, where users only need to specify the resource requirements of their job and a script to launch, making it simple to scale work across multiple nodes. Works with most ML application frameworks out of the box.
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
num_nodes: 100
|
|
63
|
+
|
|
64
|
+
resources:
|
|
65
|
+
accelerators: H100:8
|
|
66
|
+
cloud: kubernetes
|
|
67
|
+
labels:
|
|
68
|
+
kueue.x-k8s.io/queue-name: user-queue
|
|
69
|
+
kueue.x-k8s.io/priority-class: low-priority
|
|
70
|
+
|
|
71
|
+
run: |
|
|
72
|
+
torchrun \
|
|
73
|
+
--nproc_per_node 8 \
|
|
74
|
+
--rdzv_id=1 --rdzv_endpoint=$master_addr:1234 \
|
|
75
|
+
--rdzv_backend=c10d --nnodes $num_nodes \
|
|
76
|
+
torch_ddp_benchmark.py --distributed-backend nccl
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
For cluster administrators
|
|
80
|
+
- [DCGM Exporter](https://github.com/NVIDIA/dcgm-exporter), [GPU operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/), [Network Operator](https://github.com/Mellanox/network-operator) - For installing NVIDIA driver, container runtime, and exporting node health metrics.
|
|
81
|
+
- [Kueue](https://kueue.sigs.k8s.io/docs/) - centralized creation of job queues, gang-scheduling, and resource quotas and sharing across projects.
|
|
82
|
+
- [Prometheus](https://prometheus.io/) - For publishing metrics about node health and workload queues.
|
|
83
|
+
- [OpenTelemetry](https://opentelemetry.io/) - For pushing logs from each node
|
|
84
|
+
- [Grafana, Loki](https://grafana.com/) - Visualizations for metrics/logging solution.
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
## Community & Support
|
|
88
|
+
- [Discord](https://discord.com/invite/HQUBJSVgAP)
|
|
89
|
+
- founders@trainy.ai
|
|
90
|
+
|
|
91
|
+
## Contributor Guide
|
|
92
|
+
|
|
93
|
+
Format your code with
|
|
94
|
+
```
|
|
95
|
+
poetry install --with dev
|
|
96
|
+
bash format.sh
|
|
97
|
+
```
|
|
98
|
+
|