konduktor-nightly 0.1.0.dev20250804105449__py3-none-any.whl → 0.1.0.dev20250806105405__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of konduktor-nightly might be problematic.
- konduktor/__init__.py +4 -7
- konduktor/backends/__init__.py +2 -4
- konduktor/backends/constants.py +12 -0
- konduktor/backends/deployment.py +179 -0
- konduktor/backends/deployment_utils.py +835 -0
- konduktor/backends/jobset.py +2 -2
- konduktor/backends/jobset_utils.py +16 -266
- konduktor/backends/pod_utils.py +392 -0
- konduktor/cli.py +343 -8
- konduktor/controller/launch.py +1 -1
- konduktor/execution.py +5 -2
- konduktor/kube_client.py +8 -0
- konduktor/resource.py +20 -0
- konduktor/serving.py +149 -0
- konduktor/task.py +61 -0
- konduktor/templates/deployment.yaml.j2 +142 -0
- konduktor/templates/pod.yaml.j2 +36 -0
- konduktor/utils/accelerator_registry.py +1 -1
- konduktor/utils/log_utils.py +1 -1
- konduktor/utils/schemas.py +42 -0
- konduktor/utils/validator.py +51 -16
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/RECORD +26 -21
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250804105449.dist-info → konduktor_nightly-0.1.0.dev20250806105405.dist-info}/entry_points.txt +0 -0
konduktor/backends/jobset.py
CHANGED

@@ -15,7 +15,7 @@ if typing.TYPE_CHECKING:
     from konduktor.data import storage as storage_lib

 from konduktor import config, logging
-from konduktor.backends import backend, jobset_utils
+from konduktor.backends import backend, jobset_utils, pod_utils
 from konduktor.utils import kubernetes_utils, log_utils, rich_utils, ux_utils

 Path = str
@@ -172,7 +172,7 @@ class JobsetBackend(backend.Backend):
         # the working container starts.

         # first define the pod spec then create the jobset definition
-        pod_spec = jobset_utils.create_pod_spec(task)
+        pod_spec = pod_utils.create_pod_spec(task)
         context = kubernetes_utils.get_current_kube_config_context_name()
         namespace = kubernetes_utils.get_kube_config_context_namespace(context)
         # TODO(asaiacai): need to set env variables in pod
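The two hunks above are the whole of the jobset.py change: pod-spec construction moves out of jobset_utils into the new pod_utils module, and the call site is updated accordingly. A minimal sketch of the resulting launch path (create_pod_spec and create_jobset are real names from this diff; this launch() body and the exact create_jobset argument list are assumptions for illustration):

    # Sketch only. create_pod_spec/create_jobset come from this diff;
    # the launch() body and create_jobset's full signature are assumed.
    from konduktor.backends import jobset_utils, pod_utils
    from konduktor.utils import kubernetes_utils

    def launch(task):
        # pod spec is now built by the shared pod_utils helper ...
        pod_spec = pod_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        # ... while jobset_utils still owns JobSet CRUD
        jobset_utils.create_jobset(namespace, task, pod_spec)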
konduktor/backends/jobset_utils.py
CHANGED

@@ -1,32 +1,25 @@
 """Jobset utils: wraps CRUD operations for jobsets"""

-import base64
 import enum
 import json
-import os
 import tempfile
 import typing
 from datetime import datetime, timezone
 from typing import Any, Dict, Optional, Tuple
-from urllib.parse import urlparse

-import click
 import colorama

 if typing.TYPE_CHECKING:
     from datetime import timedelta

     import konduktor
-from konduktor import
+from konduktor import kube_client, logging
 from konduktor.backends import constants as backend_constants
-from konduktor.
+from konduktor.backends import pod_utils
 from konduktor.utils import (
     common_utils,
-    exceptions,
     kubernetes_utils,
     log_utils,
-    ux_utils,
-    validator,
 )

 if typing.TYPE_CHECKING:
@@ -38,13 +31,14 @@ JOBSET_API_GROUP = 'jobset.x-k8s.io'
 JOBSET_API_VERSION = 'v1alpha2'
 JOBSET_PLURAL = 'jobsets'

-
-
-
-
-
+# Use shared constants from konduktor.backends.constants
+JOBSET_NAME_LABEL = backend_constants.JOB_NAME_LABEL
+JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
+JOBSET_USER_LABEL = backend_constants.USER_LABEL
+JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
+JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL

-SECRET_BASENAME_LABEL =
+SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL

 _JOBSET_METADATA_LABELS = {
     'jobset_name_label': JOBSET_NAME_LABEL,
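With this hunk the JOBSET_* labels become aliases into the new shared konduktor/backends/constants.py (listed above as +12 -0), so the jobset and deployment backends can share one set of label keys. A plausible sketch of that module follows; only the constant names are confirmed by this diff, and every value shown is an illustrative guess:

    # Hypothetical sketch of konduktor/backends/constants.py.
    # The names appear in this diff; the values are guesses.
    JOB_NAME_LABEL = 'konduktor/job-name'                   # guessed value
    USERID_LABEL = 'konduktor/userid'                       # guessed value
    USER_LABEL = 'konduktor/user'                           # guessed value
    ACCELERATOR_LABEL = 'konduktor/accelerator'             # guessed value
    NUM_ACCELERATORS_LABEL = 'konduktor/num-accelerators'   # guessed value
    SECRET_BASENAME_LABEL = 'konduktor/secret-basename'     # guessed value
    KONDUKTOR_SSH_PORT = 2222  # name referenced later in this diff; value guessed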
@@ -54,8 +48,6 @@ _JOBSET_METADATA_LABELS = {
     'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
 }

-_RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
-

 class JobNotFoundError(Exception):
     pass
@@ -73,236 +65,6 @@ if typing.TYPE_CHECKING:
     import konduktor


-def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
-    """Merges the task defintion with config
-    to create a final pod spec dict for the job
-
-    Returns:
-        Dict[str, Any]: k8s pod spec
-    """
-    context = kubernetes_utils.get_current_kube_config_context_name()
-    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
-
-    # fill out the templating variables
-    assert task.resources is not None, 'Task resources are required'
-    if task.resources.accelerators:
-        num_gpus = list(task.resources.accelerators.values())[0]
-    else:
-        num_gpus = 0
-    task.name = f'{task.name}-{common_utils.get_usage_run_id()[:4]}'
-    node_hostnames = ','.join(
-        [f'{task.name}-workers-0-{idx}.{task.name}' for idx in range(task.num_nodes)]
-    )
-    master_addr = f'{task.name}-workers-0-0.{task.name}'
-
-    if task.resources.accelerators:
-        accelerator_type = list(task.resources.accelerators.keys())[0]
-    else:
-        accelerator_type = None
-
-    assert task.resources.cpus is not None, 'Task resources cpus are required'
-    assert task.resources.memory is not None, 'Task resources memory are required'
-    assert task.resources.image_id is not None, 'Task resources image_id are required'
-
-    # template the commands to run on the container for syncing files. At this point
-    # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
-    # first we iterate through storage_mounts and then file_mounts.
-    sync_commands = []
-    mkdir_commands = []
-    storage_secrets = {}
-    # first do storage_mount sync
-    for dst, store in task.storage_mounts.items():
-        # TODO(asaiacai) idk why but theres an extra storage mount for the
-        # file mounts. Should be cleaned up eventually in
-        # maybe_translate_local_file_mounts_and_sync_up
-        assert store.source is not None and isinstance(
-            store.source, str
-        ), 'Store source is required'
-        store_scheme = urlparse(store.source).scheme
-        if '/tmp/konduktor-job-filemounts-files' in dst:
-            continue
-        # should impelement a method here instead of raw dog dict access
-        cloud_store = registry._REGISTRY[store_scheme]
-        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
-        exists, _ = kubernetes_utils.check_secret_exists(
-            storage_secrets[store_scheme], namespace=namespace, context=context
-        )
-        assert exists, (
-            f"secret {storage_secrets[store_scheme]} doesn't "
-            f'exist in namespace {namespace}'
-        )
-        mkdir_commands.append(
-            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
-        )
-        assert store._bucket_sub_path is not None
-        sync_commands.append(
-            cloud_store.make_sync_dir_command(
-                os.path.join(store.source, store._bucket_sub_path), dst
-            )
-        )
-
-    # then do file_mount sync.
-    assert task.file_mounts is not None
-    for dst, src in task.file_mounts.items():
-        store_scheme = str(urlparse(store.source).scheme)
-        cloud_store = registry._REGISTRY[store_scheme]
-        mkdir_commands.append(
-            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
-            f'mkdir -p {os.path.dirname(dst)}'
-        )
-        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
-        exists, reason = kubernetes_utils.check_secret_exists(
-            storage_secrets[store_scheme], namespace=namespace, context=context
-        )
-        assert exists, (
-            f'secret {storage_secrets[store_scheme]} '
-            f"doesn't exist in namespace {namespace}"
-        )
-        sync_commands.append(cloud_store.make_sync_file_command(src, dst))
-
-    tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
-    if tailscale_secret:
-        secret_exist, err = kubernetes_utils.check_secret_exists(
-            tailscale_secret, namespace, context
-        )
-        if not secret_exist:
-            with ux_utils.print_exception_no_traceback():
-                raise exceptions.MissingSecretError(
-                    f'No tailscale auth-key secret `{tailscale_secret}` found even '
-                    f'though specified by `tailscale.secret_name`: {err}'
-                )
-
-    enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
-    secret_name = None
-    if enable_ssh:
-        private_key_path, public_key_path = authentication.get_or_generate_keys()
-        with (
-            open(private_key_path, 'rb') as private_key_file,
-            open(public_key_path, 'rb') as public_key_file,
-        ):
-            private_key, public_key = private_key_file.read(), public_key_file.read()
-        user_hash = common_utils.get_user_hash()
-        secret_name = f'konduktor-ssh-keys-{user_hash}'
-        ok, result = kubernetes_utils.set_secret(
-            secret_name=secret_name,
-            namespace=namespace,
-            context=context,
-            data={
-                'PUBKEY': base64.b64encode(public_key).decode(),
-                'PRIVKEY': base64.b64encode(private_key).decode(),
-            },
-        )
-        if not ok:
-            raise exceptions.CreateSecretError(
-                f'Failed to set k8s secret {secret_name}: \n{result}'
-            )
-
-    # Mount the user's secrets
-    git_ssh_secret_name = None
-    env_secret_envs = []
-    default_secrets = []
-
-    user_hash = common_utils.get_user_hash()
-    label_selector = f'konduktor/owner={user_hash}'
-    user_secrets = kubernetes_utils.list_secrets(
-        namespace, context, label_filter=label_selector
-    )
-
-    for secret in user_secrets:
-        kind = kubernetes_utils.get_secret_kind(secret)
-        if kind == 'git-ssh' and git_ssh_secret_name is None:
-            git_ssh_secret_name = secret.metadata.name
-        elif kind == 'env':
-            env_secret_name = secret.metadata.name
-            key = next(iter(secret.data))
-            env_secret_envs.append(
-                {
-                    'name': key,
-                    'valueFrom': {
-                        'secretKeyRef': {'name': env_secret_name, 'key': key}
-                    },
-                }
-            )
-        elif kind == 'default':
-            default_secret_name = secret.metadata.name
-            basename = secret.metadata.labels.get(
-                SECRET_BASENAME_LABEL, default_secret_name
-            )
-            default_secrets.append(
-                {'k8s_name': default_secret_name, 'mount_name': basename}
-            )
-
-    with tempfile.NamedTemporaryFile() as temp:
-        common_utils.fill_template(
-            'pod.yaml.j2',
-            {
-                # TODO(asaiacai) need to parse/round these numbers and sanity check
-                'cpu': kubernetes_utils.parse_cpu_or_gpu_resource(task.resources.cpus),
-                'memory': kubernetes_utils.parse_memory_resource(task.resources.memory),
-                'image_id': task.resources.image_id,
-                'num_gpus': num_gpus,
-                'master_addr': master_addr,
-                'num_nodes': task.num_nodes,
-                'job_name': task.name,  # append timestamp and user id here?
-                'setup_cmd': task.setup or '',
-                'run_cmd': task.run,
-                'node_hostnames': node_hostnames,
-                'accelerator_type': accelerator_type,
-                'sync_commands': sync_commands,
-                'mkdir_commands': mkdir_commands,
-                'mount_secrets': storage_secrets,
-                'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
-                'user': common_utils.get_cleaned_username(),
-                # Tailscale credentials
-                'tailscale_secret': tailscale_secret,
-                # SSH
-                'enable_ssh': enable_ssh,
-                'secret_name': secret_name,
-                'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
-                # Kinds of Secrets
-                # --kind git-ssh
-                'git_ssh': git_ssh_secret_name,
-                # --kind default
-                'default_secrets': default_secrets,
-                # KONDUKTOR_DEBUG
-                'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
-            },
-            temp.name,
-        )
-        pod_config = common_utils.read_yaml(temp.name)
-        # merge with `~/.konduktor/config.yaml`
-        kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
-        pod_config = common_utils.read_yaml(temp.name)
-
-    # Priority order: task.envs > secret envs > existing pod_config envs
-    existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
-        'env', []
-    )
-    env_map = {env['name']: env for env in existing_envs}
-
-    # Inject secret envs
-    for env in env_secret_envs:
-        env_map[env['name']] = env
-
-    # Inject task.envs
-    for k, v in task.envs.items():
-        env_map[k] = {'name': k, 'value': v}
-
-    # Replace the container's env section with the merged and prioritized map
-    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
-        env_map.values()
-    )
-    logger.debug(f'rendered pod spec: \n\t{pod_config}')
-
-    # validate pod spec using json schema
-    try:
-        validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
-    except ValueError as e:
-        raise click.UsageError(str(e))
-
-    return pod_config
-
-
 def create_jobset(
     namespace: str,
     task: 'konduktor.Task',
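Everything removed above (roughly 230 lines) is the body of create_pod_spec, which per the file list now lives in the new konduktor/backends/pod_utils.py (+392 lines). Assuming the function keeps the signature and return shape shown in the removed code, callers use it like this (a usage sketch, not the pod_utils source):

    # Usage sketch. Assumes pod_utils.create_pod_spec keeps the removed
    # signature: (task: 'konduktor.Task') -> Dict[str, Any].
    from konduktor.backends import pod_utils

    def build_pod_spec(task):
        pod_config = pod_utils.create_pod_spec(task)
        # the removed code nested the rendered k8s spec under these keys
        return pod_config['kubernetes']['pod_config']['spec']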
@@ -313,12 +75,8 @@ def create_jobset(
     and returns the created jobset spec
     """
     assert task.resources is not None, 'Task resources are undefined'
-
-
-        num_accelerators = list(task.resources.accelerators.values())[0]
-    else:
-        accelerator_type = 'None'
-        num_accelerators = 0
+    accelerator_type = task.resources.get_accelerator_type() or 'None'
+    num_accelerators = task.resources.get_accelerator_count() or 0
     with tempfile.NamedTemporaryFile() as temp:
         common_utils.fill_template(
             'jobset.yaml.j2',
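get_accelerator_type() and get_accelerator_count() are new helpers on Resources (konduktor/resource.py gains 20 lines in the file list); their bodies are not shown in this diff. A sketch inferred from the removed if/else branch they replace, with the method names real and the implementations assumed:

    # Hypothetical bodies, inferred from the removed branch above.
    from dataclasses import dataclass
    from typing import Dict, Optional, Union

    @dataclass
    class Resources:
        accelerators: Optional[Dict[str, Union[int, float]]] = None  # e.g. {'H100': 8}

        def get_accelerator_type(self) -> Optional[str]:
            # first (and only) accelerator name, if any
            return next(iter(self.accelerators)) if self.accelerators else None

        def get_accelerator_count(self) -> Union[int, float, None]:
            # the count paired with that accelerator
            return next(iter(self.accelerators.values())) if self.accelerators else None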
@@ -336,19 +94,11 @@
             temp.name,
         )
         jobset_spec = common_utils.read_yaml(temp.name)
-
-
-
-
-
-        if not maxRunDurationSeconds:
-            raise ValueError('maxRunDurationSeconds is required')
-        jobset_spec['jobset']['metadata']['annotations'][
-            _RUN_DURATION_ANNOTATION_KEY
-        ] = str(maxRunDurationSeconds)
-        jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']['spec'][
-            'template'
-        ] = pod_spec  # noqa: E501
+        # Inject JobSet metadata (labels and annotations)
+        pod_utils.inject_jobset_metadata(jobset_spec, task)
+
+        # Merge pod spec into JobSet template
+        pod_utils.merge_pod_into_jobset_template(jobset_spec, pod_spec)
         try:
             context = kubernetes_utils.get_current_kube_config_context_name()
             jobset = kube_client.crd_api(context=context).create_namespaced_custom_object(