konduktor-nightly 0.1.0.dev20250810104857__tar.gz → 0.1.0.dev20250812105102__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of konduktor-nightly might be problematic. Click here for more details.
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/PKG-INFO +1 -1
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/__init__.py +2 -2
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/constants.py +1 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/jobset.py +3 -2
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/jobset_utils.py +45 -24
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/cli.py +25 -15
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/logging.py +6 -4
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/templates/jobset.yaml.j2 +3 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/log_utils.py +29 -22
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/ux_utils.py +25 -11
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/pyproject.toml +1 -1
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/aws.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/common.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/gcp.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/authentication.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/backend.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/deployment.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/deployment_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/pod_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/check.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/config.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/launch.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/node.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/parse.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/README.md +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/backend/main.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/backend/sockets.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/.gitignore +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/globals.css +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/layout.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/page.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/jsconfig.json +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/next.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/package-lock.json +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/package.json +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/server.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/aws/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/aws/s3.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/data_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/gcs.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/storage.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/storage_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/execution.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/kube_client.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/controller_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/dashboard_deployment.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/resource.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/serving.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/task.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/templates/deployment.yaml.j2 +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/templates/pod.yaml.j2 +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/usage/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/usage/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/__init__.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/accelerator_registry.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/annotations.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/base64_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/common_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/constants.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/env_options.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/exceptions.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/kubernetes_enums.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/kubernetes_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/loki_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/rich_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/schemas.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/subprocess_utils.py +0 -0
- {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/validator.py +0 -0
|
@@ -11,7 +11,7 @@ from konduktor.task import Task
|
|
|
11
11
|
__all__ = ['launch', 'Resources', 'Task', 'Serving']
|
|
12
12
|
|
|
13
13
|
# Replaced with the current commit when building the wheels.
|
|
14
|
-
_KONDUKTOR_COMMIT_SHA = '
|
|
14
|
+
_KONDUKTOR_COMMIT_SHA = 'f4ba2084fac1c1030245b475323f4f3a57fd3fa3'
|
|
15
15
|
os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
|
|
16
16
|
|
|
17
17
|
|
|
@@ -45,5 +45,5 @@ def _get_git_commit():
|
|
|
45
45
|
|
|
46
46
|
|
|
47
47
|
__commit__ = _get_git_commit()
|
|
48
|
-
__version__ = '1.0.0.dev0.1.0.
|
|
48
|
+
__version__ = '1.0.0.dev0.1.0.dev20250812105102'
|
|
49
49
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
|
@@ -8,6 +8,7 @@ USERID_LABEL = 'trainy.ai/user-id'
|
|
|
8
8
|
USER_LABEL = 'trainy.ai/username'
|
|
9
9
|
ACCELERATOR_LABEL = 'trainy.ai/accelerator'
|
|
10
10
|
NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
|
|
11
|
+
MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'
|
|
11
12
|
|
|
12
13
|
# Start/stop/status related labels
|
|
13
14
|
STOP_USERID_LABEL = 'trainy.ai/stop-userid'
|
|
@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
|
|
|
176
176
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
177
177
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
178
178
|
# TODO(asaiacai): need to set env variables in pod
|
|
179
|
-
jobset_utils.create_jobset(
|
|
179
|
+
jobset_response = jobset_utils.create_jobset(
|
|
180
180
|
namespace,
|
|
181
181
|
task,
|
|
182
182
|
pod_spec['kubernetes']['pod_config'],
|
|
@@ -192,9 +192,10 @@ class JobsetBackend(backend.Backend):
|
|
|
192
192
|
):
|
|
193
193
|
_wait_for_jobset_start(namespace, task.name)
|
|
194
194
|
try:
|
|
195
|
+
assert jobset_response is not None
|
|
195
196
|
log_thread = threading.Thread(
|
|
196
197
|
target=log_utils.tail_logs,
|
|
197
|
-
args=(
|
|
198
|
+
args=(jobset_response,),
|
|
198
199
|
daemon=True,
|
|
199
200
|
)
|
|
200
201
|
logger.info('streaming logs...')
|
|
@@ -39,6 +39,7 @@ JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
|
|
|
39
39
|
JOBSET_USER_LABEL = backend_constants.USER_LABEL
|
|
40
40
|
JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
|
|
41
41
|
JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
|
|
42
|
+
JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL
|
|
42
43
|
|
|
43
44
|
SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
|
|
44
45
|
|
|
@@ -48,6 +49,7 @@ _JOBSET_METADATA_LABELS = {
|
|
|
48
49
|
'jobset_user_label': JOBSET_USER_LABEL,
|
|
49
50
|
'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
|
|
50
51
|
'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
|
|
52
|
+
'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
|
|
51
53
|
}
|
|
52
54
|
|
|
53
55
|
|
|
@@ -79,6 +81,7 @@ def create_jobset(
|
|
|
79
81
|
assert task.resources is not None, 'Task resources are undefined'
|
|
80
82
|
accelerator_type = task.resources.get_accelerator_type() or 'None'
|
|
81
83
|
num_accelerators = task.resources.get_accelerator_count() or 0
|
|
84
|
+
labels = task.resources.labels if task.resources.labels else {}
|
|
82
85
|
with tempfile.NamedTemporaryFile() as temp:
|
|
83
86
|
common_utils.fill_template(
|
|
84
87
|
'jobset.yaml.j2',
|
|
@@ -91,6 +94,7 @@ def create_jobset(
|
|
|
91
94
|
'num_accelerators': num_accelerators,
|
|
92
95
|
'completions': task.resources.get_completions(),
|
|
93
96
|
'max_restarts': task.resources.get_max_restarts(),
|
|
97
|
+
'max_execution_time': labels.get('maxRunDurationSeconds', None),
|
|
94
98
|
**_JOBSET_METADATA_LABELS,
|
|
95
99
|
},
|
|
96
100
|
temp.name,
|
|
@@ -232,8 +236,12 @@ def stop_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
|
|
|
232
236
|
'spec': {'suspend': True},
|
|
233
237
|
'metadata': {
|
|
234
238
|
'annotations': {
|
|
235
|
-
backend_constants.STOP_USERID_LABEL:
|
|
236
|
-
|
|
239
|
+
backend_constants.STOP_USERID_LABEL: (
|
|
240
|
+
common_utils.user_and_hostname_hash()
|
|
241
|
+
),
|
|
242
|
+
backend_constants.STOP_USERNAME_LABEL: (
|
|
243
|
+
common_utils.get_cleaned_username()
|
|
244
|
+
),
|
|
237
245
|
}
|
|
238
246
|
},
|
|
239
247
|
}
|
|
@@ -426,6 +434,36 @@ def _parse_timestamp_filter(timestamp_str: str) -> datetime:
|
|
|
426
434
|
)
|
|
427
435
|
|
|
428
436
|
|
|
437
|
+
def _format_timestamp(timestamp: str) -> str:
    """Format a UTC timestamp as MM/DD/YY HH:MMAM/PM in the local timezone.

    Args:
        timestamp: A UTC timestamp of the form ``YYYY-MM-DDTHH:MM:SSZ``.
            An empty string or the ``'-'`` placeholder is treated as
            "no timestamp available".

    Returns:
        The timestamp converted to the local timezone and rendered with
        ``'%m/%d/%y %I:%M%p'``, or ``'-'`` when no timestamp was given.

    Raises:
        ValueError: If ``timestamp`` is non-empty, is not ``'-'``, and does
            not match the expected ``YYYY-MM-DDTHH:MM:SSZ`` layout.
    """
    # A missing value cannot be parsed; hand back the placeholder instead of
    # letting strptime raise on it.
    if not timestamp or timestamp == '-':
        return '-'

    # The trailing 'Z' is matched literally, so the parsed value is naive;
    # tag it as UTC explicitly before converting.
    dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
        tzinfo=timezone.utc
    )
    # astimezone() with no argument converts to the system local timezone.
    dt_local = dt_utc.astimezone()
    return dt_local.strftime('%m/%d/%y %I:%M%p')
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def _get_job_start_time(job: Dict[str, Any]) -> str:
    """Return the raw transition time of the job's 'ResumeJobs' condition.

    Scans ``job['status']['conditions']`` in order and returns the
    ``lastTransitionTime`` of the first condition whose ``reason`` is
    ``'ResumeJobs'``.

    Args:
        job: The job object as a dict.

    Returns:
        The unformatted ``lastTransitionTime`` string, or ``'-'`` when the
        job has no status, no conditions, no ``'ResumeJobs'`` condition, or
        that condition carries no transition time.
    """
    # A job may have no 'status' or 'conditions' key at all (or a null
    # value); treat every such case as "no conditions" rather than raising
    # KeyError / TypeError.
    conditions = (job.get('status') or {}).get('conditions') or []
    for condition in conditions:
        if condition.get('reason') == 'ResumeJobs':
            # Fall back to the '-' placeholder (not '') so callers that
            # compare against '-' never try to format an empty string.
            return condition.get('lastTransitionTime') or '-'
    return '-'
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
    """Extract the raw end time from JobSet conditions (Completed or Failed).

    Scans ``job['status']['conditions']`` in order and returns the
    ``lastTransitionTime`` of the first condition whose ``type`` is
    ``'Completed'`` or ``'Failed'`` and whose ``status`` is the string
    ``'True'``.

    Args:
        job: The JobSet object as a dict.

    Returns:
        The unformatted ``lastTransitionTime`` string, or ``'-'`` when there
        is no matching terminal condition or it carries no transition time.
    """
    # Tolerate a missing or null 'status' / 'conditions' entry.
    conditions = (job.get('status') or {}).get('conditions') or []
    for condition in conditions:
        # Look for terminal conditions with status=True (a string here,
        # not a bool).
        is_terminal = condition.get('type') in ('Completed', 'Failed')
        if is_terminal and condition.get('status') == 'True':
            # Use the '-' placeholder instead of '' so callers that compare
            # against '-' never try to format an empty string.
            return condition.get('lastTransitionTime') or '-'
    return '-'
|
|
465
|
+
|
|
466
|
+
|
|
429
467
|
def show_status_table(
|
|
430
468
|
namespace: str,
|
|
431
469
|
all_users: bool,
|
|
@@ -519,15 +557,6 @@ def show_status_table(
|
|
|
519
557
|
result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
|
|
520
558
|
return result if result else '<1 minute', delta
|
|
521
559
|
|
|
522
|
-
def _format_timestamp(timestamp: str) -> str:
|
|
523
|
-
"""Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
|
|
524
|
-
# Parse UTC timestamp and convert to local time
|
|
525
|
-
dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
|
|
526
|
-
tzinfo=timezone.utc
|
|
527
|
-
)
|
|
528
|
-
dt_local = dt_utc.astimezone() # Convert to local timezone
|
|
529
|
-
return dt_local.strftime('%m/%d/%y %I:%M%p')
|
|
530
|
-
|
|
531
560
|
def _get_resources(job: Dict[str, Any]) -> str:
|
|
532
561
|
num_pods = int(
|
|
533
562
|
job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
|
|
@@ -587,25 +616,17 @@ def show_status_table(
|
|
|
587
616
|
if before_dt and job_creation_time >= before_dt:
|
|
588
617
|
continue
|
|
589
618
|
# Get start time
|
|
590
|
-
start_time =
|
|
619
|
+
start_time = _get_job_start_time(job)
|
|
620
|
+
if start_time != '-':
|
|
621
|
+
start_time = _format_timestamp(start_time)
|
|
591
622
|
|
|
592
623
|
# Get submitted time (how long ago)
|
|
593
624
|
submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
|
|
594
625
|
|
|
595
626
|
# Get end time (from JobSet conditions)
|
|
596
|
-
def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
|
|
597
|
-
"""Extract end time from JobSet conditions (Completed or Failed)"""
|
|
598
|
-
conditions = job.get('status', {}).get('conditions', [])
|
|
599
|
-
for condition in conditions:
|
|
600
|
-
# Look for terminal conditions with status=True
|
|
601
|
-
if (
|
|
602
|
-
condition.get('type') in ['Completed', 'Failed']
|
|
603
|
-
and condition.get('status') == 'True'
|
|
604
|
-
):
|
|
605
|
-
return _format_timestamp(condition.get('lastTransitionTime', ''))
|
|
606
|
-
return '-'
|
|
607
|
-
|
|
608
627
|
end_time = _get_end_time_from_conditions(job)
|
|
628
|
+
if end_time != '-':
|
|
629
|
+
end_time = _format_timestamp(end_time)
|
|
609
630
|
|
|
610
631
|
if all_users:
|
|
611
632
|
rows.append(
|
|
@@ -732,7 +732,7 @@ def logs(
|
|
|
732
732
|
# Verify the job exists before attempting to tail logs
|
|
733
733
|
# TODO(asaiacai): unify the 404 logic under jobset_utils
|
|
734
734
|
try:
|
|
735
|
-
jobset_utils.get_jobset(namespace, job_id)
|
|
735
|
+
jobset_response = jobset_utils.get_jobset(namespace, job_id)
|
|
736
736
|
except jobset_utils.JobNotFoundError:
|
|
737
737
|
raise click.UsageError(
|
|
738
738
|
f"Job '{job_id}' not found in namespace "
|
|
@@ -741,12 +741,9 @@ def logs(
|
|
|
741
741
|
f'{colorama.Style.RESET_ALL}.'
|
|
742
742
|
)
|
|
743
743
|
|
|
744
|
-
|
|
745
|
-
'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
|
|
746
|
-
fg='yellow',
|
|
747
|
-
)
|
|
744
|
+
assert isinstance(jobset_response, dict), f'jobset_response: {jobset_response}'
|
|
748
745
|
log_utils.tail_logs(
|
|
749
|
-
|
|
746
|
+
jobset_response,
|
|
750
747
|
worker_id=node_rank,
|
|
751
748
|
follow=follow,
|
|
752
749
|
num_logs=num_lines,
|
|
@@ -855,16 +852,23 @@ def launch(
|
|
|
855
852
|
'Serving information detected. Use '
|
|
856
853
|
'`konduktor serve launch` instead for serving.'
|
|
857
854
|
)
|
|
855
|
+
try:
|
|
856
|
+
_launch_with_confirm(
|
|
857
|
+
task,
|
|
858
|
+
dryrun=dryrun,
|
|
859
|
+
detach_run=detach_run,
|
|
860
|
+
no_confirm=yes,
|
|
861
|
+
serving=bool(task.serving),
|
|
862
|
+
)
|
|
863
|
+
except KeyboardInterrupt:
|
|
864
|
+
click.secho(
|
|
865
|
+
f'Detaching... manage your job {task.name} with the following commands:',
|
|
866
|
+
fg='yellow',
|
|
867
|
+
bold=True,
|
|
868
|
+
)
|
|
858
869
|
|
|
859
|
-
job_name = _launch_with_confirm(
|
|
860
|
-
task,
|
|
861
|
-
dryrun=dryrun,
|
|
862
|
-
detach_run=detach_run,
|
|
863
|
-
no_confirm=yes,
|
|
864
|
-
serving=bool(task.serving),
|
|
865
|
-
)
|
|
866
870
|
click.secho(
|
|
867
|
-
ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB,
|
|
871
|
+
ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, task.name),
|
|
868
872
|
fg='green',
|
|
869
873
|
bold=True,
|
|
870
874
|
)
|
|
@@ -1139,6 +1143,12 @@ def stop(
|
|
|
1139
1143
|
for job in track(jobs, description='Suspending job(s)...'):
|
|
1140
1144
|
jobset_utils.stop_jobset(namespace, job)
|
|
1141
1145
|
|
|
1146
|
+
click.secho(
|
|
1147
|
+
ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB_STOP, jobs),
|
|
1148
|
+
fg='green',
|
|
1149
|
+
bold=True,
|
|
1150
|
+
)
|
|
1151
|
+
|
|
1142
1152
|
|
|
1143
1153
|
@cli.command(cls=_DocumentedCodeCommand)
|
|
1144
1154
|
@click.argument(
|
|
@@ -1836,7 +1846,7 @@ def main():
|
|
|
1836
1846
|
return cli(standalone_mode=False)
|
|
1837
1847
|
except click.exceptions.Abort:
|
|
1838
1848
|
click.secho('Detaching...', fg='yellow', bold=True)
|
|
1839
|
-
return
|
|
1849
|
+
return None
|
|
1840
1850
|
|
|
1841
1851
|
|
|
1842
1852
|
if __name__ == '__main__':
|
|
@@ -75,12 +75,14 @@ def get_logger(name: str):
|
|
|
75
75
|
fh.setFormatter(FORMATTER)
|
|
76
76
|
logger.addHandler(fh)
|
|
77
77
|
|
|
78
|
-
# --- Console logging:
|
|
78
|
+
# --- Console logging: INFO level by default, DEBUG if KONDUKTOR_DEBUG=1 ---
|
|
79
|
+
ch = logging.StreamHandler()
|
|
79
80
|
if os.environ.get('KONDUKTOR_DEBUG') == '1':
|
|
80
|
-
ch = logging.StreamHandler()
|
|
81
81
|
ch.setLevel(logging.DEBUG)
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
else:
|
|
83
|
+
ch.setLevel(logging.INFO)
|
|
84
|
+
ch.setFormatter(FORMATTER)
|
|
85
|
+
logger.addHandler(ch)
|
|
84
86
|
|
|
85
87
|
logger.propagate = False
|
|
86
88
|
return logger
|
|
@@ -11,6 +11,9 @@ jobset:
|
|
|
11
11
|
{{ jobset_accelerator_label }}: "{{ accelerator_type }}"
|
|
12
12
|
{{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
|
|
13
13
|
{% endif %}
|
|
14
|
+
{% if max_execution_time %}
|
|
15
|
+
{{ jobset_max_execution_time_label }}: "{{ max_execution_time }}"
|
|
16
|
+
{% endif %}
|
|
14
17
|
trainy.ai/konduktor-managed: "true"
|
|
15
18
|
parent: "trainy"
|
|
16
19
|
annotations: {}
|
|
@@ -337,45 +337,51 @@ def tail_loki_logs_ws(
|
|
|
337
337
|
|
|
338
338
|
|
|
339
339
|
def tail_vicky_logs(
|
|
340
|
-
|
|
340
|
+
jobset_response: Dict[str, Any],
|
|
341
341
|
worker_id: int = 0,
|
|
342
|
-
num_logs: int =
|
|
342
|
+
num_logs: int = -1,
|
|
343
343
|
follow: bool = True,
|
|
344
344
|
):
|
|
345
|
+
job_name = jobset_response['metadata']['name']
|
|
345
346
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
346
347
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
347
348
|
query: Dict[str, Any] = {}
|
|
348
|
-
if num_logs > 5000:
|
|
349
|
-
# TODO(asaiacai): we should not have a limit on the number of logs, but rather
|
|
350
|
-
# let the user specify any number of lines, and we can print the last N lines.
|
|
351
|
-
# this can be done in chunks. Potentially, we can query range
|
|
352
|
-
# until we reach the end of the log and then invoke tail again.
|
|
353
|
-
# Also include checks that the job is running/ever ran.
|
|
354
|
-
raise ValueError('num_logs must be less than or equal to 5000')
|
|
355
|
-
logger.info('ignoring num_logs argument for VictoriaLogs')
|
|
356
349
|
vicky_svc = kr8s.objects.Service.get(
|
|
357
350
|
'vls-victoria-logs-single-server', namespace='victoria-logs'
|
|
358
351
|
)
|
|
352
|
+
|
|
353
|
+
if num_logs == -1:
|
|
354
|
+
query = {}
|
|
355
|
+
else:
|
|
356
|
+
assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
|
|
357
|
+
query = {'limit': num_logs}
|
|
358
|
+
if follow:
|
|
359
|
+
logger.info(
|
|
360
|
+
'No end time found, tailing logs from 1 hour ago. '
|
|
361
|
+
'If logs come up empty, there might be logs just earlier '
|
|
362
|
+
'than the past hour, check Grafana or use:\n'
|
|
363
|
+
f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
|
|
364
|
+
f'`konduktor tail --no-follow {job_name}`'
|
|
365
|
+
f'{colorama.Style.RESET_ALL}'
|
|
366
|
+
)
|
|
367
|
+
query['start_offset'] = '1h'
|
|
368
|
+
query['query'] = (
|
|
369
|
+
f'k8s.namespace.name: "{namespace}" AND '
|
|
370
|
+
f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
|
|
371
|
+
f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
|
|
372
|
+
)
|
|
373
|
+
|
|
359
374
|
with kr8s.portforward.PortForward(
|
|
360
375
|
vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
|
|
361
376
|
) as port:
|
|
362
377
|
if follow:
|
|
363
378
|
timeout = INFINITY
|
|
364
379
|
vicky_url = f'http://localhost:{port}/select/logsql/tail'
|
|
365
|
-
query = {}
|
|
366
380
|
else:
|
|
367
381
|
vicky_url = f'http://localhost:{port}/select/logsql/query'
|
|
368
|
-
query = {'limit': num_logs}
|
|
369
382
|
timeout = 1
|
|
370
383
|
logger.debug(f'Vicky URL: {vicky_url}')
|
|
371
384
|
|
|
372
|
-
query['query'] = (
|
|
373
|
-
f'k8s.namespace.name: "{namespace}" AND '
|
|
374
|
-
f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
|
|
375
|
-
f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
|
|
376
|
-
)
|
|
377
|
-
query['start_offset'] = '1h'
|
|
378
|
-
|
|
379
385
|
try:
|
|
380
386
|
logger.debug(f'Making request to {vicky_url} with query: {query}')
|
|
381
387
|
with requests.post(
|
|
@@ -412,16 +418,17 @@ def tail_vicky_logs(
|
|
|
412
418
|
|
|
413
419
|
|
|
414
420
|
def tail_logs(
|
|
415
|
-
|
|
421
|
+
jobset_response: Dict[str, Any],
|
|
416
422
|
worker_id: int = 0,
|
|
417
423
|
num_logs: int = 1000,
|
|
418
424
|
follow: bool = True,
|
|
419
425
|
):
|
|
426
|
+
job_name = jobset_response['metadata']['name']
|
|
420
427
|
logs_backend = config.get_nested(('logs', 'backend'), None)
|
|
421
428
|
if logs_backend == LogBackend.VICTORIA:
|
|
422
|
-
tail_vicky_logs(
|
|
429
|
+
tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
|
|
423
430
|
elif logs_backend == LogBackend.LOKI:
|
|
424
431
|
tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
|
|
425
432
|
else:
|
|
426
433
|
logger.info('Defaulting to VictoriaLogs')
|
|
427
|
-
tail_vicky_logs(
|
|
434
|
+
tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
|
|
@@ -6,7 +6,7 @@ import os
|
|
|
6
6
|
import sys
|
|
7
7
|
import traceback
|
|
8
8
|
import typing
|
|
9
|
-
from typing import Callable, Optional, Union
|
|
9
|
+
from typing import Callable, List, Optional, Union
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
12
12
|
import rich.console as rich_console
|
|
@@ -196,27 +196,41 @@ def spinner_message(
|
|
|
196
196
|
|
|
197
197
|
class CommandHintType(enum.Enum):
|
|
198
198
|
JOB = 'JOB'
|
|
199
|
+
JOB_STOP = 'JOB_STOP'
|
|
199
200
|
|
|
200
201
|
|
|
201
|
-
def command_hint_messages(
|
|
202
|
+
def command_hint_messages(
|
|
203
|
+
hint_type: CommandHintType,
|
|
204
|
+
job_id: Union[str, List[str]],
|
|
205
|
+
) -> str:
|
|
202
206
|
"""Gets the command hint messages for the given job id."""
|
|
207
|
+
hint_str = '\n📋 Useful Commands'
|
|
203
208
|
if hint_type == CommandHintType.JOB:
|
|
204
209
|
job_hint_str = (
|
|
205
210
|
f'\nJob ID: {job_id}'
|
|
206
|
-
f'\n{INDENT_SYMBOL}To cancel the job:\t\t'
|
|
207
|
-
f'{BOLD}konduktor down {job_id} {RESET_BOLD}'
|
|
208
211
|
f'\n{INDENT_SYMBOL}To stream job logs:\t\t'
|
|
209
212
|
f'{BOLD}konduktor logs {job_id} {RESET_BOLD}'
|
|
210
213
|
f'\n{INDENT_SYMBOL}To list all jobs:\t\t'
|
|
211
214
|
f'{BOLD}konduktor status{RESET_BOLD}'
|
|
215
|
+
f'\n{INDENT_SYMBOL}To suspend the job:\t\t'
|
|
216
|
+
f'{BOLD}konduktor stop {job_id} {RESET_BOLD}'
|
|
217
|
+
f'\n{INDENT_SYMBOL}{colorama.Fore.RED}To delete the job:\t\t'
|
|
218
|
+
f'{BOLD}konduktor down {job_id} {RESET_BOLD}{colorama.Style.RESET_ALL}'
|
|
212
219
|
)
|
|
213
|
-
hint_str = '\n📋 Useful Commands'
|
|
214
220
|
hint_str += f'{job_hint_str}'
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
+
elif hint_type == CommandHintType.JOB_STOP:
|
|
222
|
+
assert isinstance(job_id, list), 'job_id must be a list of strings'
|
|
223
|
+
job_ids_str = ' '.join(job_id)
|
|
224
|
+
hint_str += (
|
|
225
|
+
f'\n{INDENT_SYMBOL}To resume the following jobs:\t\t'
|
|
226
|
+
f'{BOLD}konduktor start {job_ids_str} {RESET_BOLD}'
|
|
227
|
+
)
|
|
221
228
|
else:
|
|
222
229
|
raise ValueError(f'Invalid hint type: {hint_type}')
|
|
230
|
+
|
|
231
|
+
if config.get_nested(('tailscale', 'secret_name'), None) is not None:
|
|
232
|
+
hint_str += (
|
|
233
|
+
f'\n{INDENT_SYMBOL}To tailscale ssh:\t\t'
|
|
234
|
+
f'{BOLD}ssh root@{job_id}-workers-0-0 {RESET_BOLD}'
|
|
235
|
+
)
|
|
236
|
+
return hint_str
|
{konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/LICENSE
RENAMED
|
File without changes
|
{konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|