konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/templates/jobset.yaml.j2
@@ -0,0 +1,31 @@
+ jobset:
+   apiVersion: jobset.x-k8s.io/v1alpha2
+   kind: JobSet
+   metadata:
+     name: {{ job_name }}
+     labels:
+       {{ jobset_name_label }}: "{{ job_name }}"
+       {{ jobset_userid_label }}: "{{ user_id }}"
+       {{ jobset_user_label }}: "{{ user }}"
+       {% if accelerator_type %}
+       {{ jobset_accelerator_label }}: "{{ accelerator_type }}"
+       {{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
+       {% endif %}
+       trainy.ai/konduktor-managed: "true"
+       parent: "trainy"
+   spec:
+     ttlSecondsAfterFinished: 259200 # 3 days
+     replicatedJobs:
+       - name: workers
+         template:
+           spec:
+             ttlSecondsAfterFinished: 600 # 10 minutes
+             parallelism: {{ num_nodes }}
+             completions: {{ num_nodes }}
+             backoffLimit: 0
+             template: {}
+             podFailurePolicy:
+               rules:
+                 - action: FailJob
+                   onPodConditions:
+                     - type: ConfigIssue
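For orientation, here is a minimal sketch of rendering this template with Jinja2. All values, and in particular the label keys, are hypothetical stand-ins; the real rendering happens inside konduktor's jobset backend (e.g. konduktor/backends/jobset_utils.py), with label names supplied by the package:

    # Hypothetical rendering of jobset.yaml.j2; all values below are made up.
    import jinja2
    import yaml

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader('konduktor/templates'))
    template = env.get_template('jobset.yaml.j2')

    rendered = template.render(
        job_name='demo-job',                              # hypothetical
        user='alice', user_id='1234',                     # hypothetical
        jobset_name_label='trainy.ai/job-name',           # hypothetical label keys
        jobset_userid_label='trainy.ai/user-id',
        jobset_user_label='trainy.ai/user',
        accelerator_type='H100',
        jobset_accelerator_label='trainy.ai/accelerator',
        jobset_num_accelerators_label='trainy.ai/num-accelerators',
        num_accelerators=8,
        num_nodes=2,
    )
    jobset = yaml.safe_load(rendered)['jobset']
    job_spec = jobset['spec']['replicatedJobs'][0]['template']['spec']
    print(job_spec['parallelism'])  # -> 2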
konduktor/templates/pod.yaml.j2
@@ -0,0 +1,185 @@
+ kubernetes:
+   pod_config:
+     metadata:
+       {% if accelerator_type %}
+       labels:
+         parent: trainy
+         trainy.ai/accelerator: {{ accelerator_type }}
+       {% endif %}
+     spec:
+       restartPolicy: "Never"
+       # trigger this on GPU request
+       {% if num_gpus > 0 %}
+       tolerations:
+         - key: "nvidia.com/gpu"
+           operator: "Exists"
+       {% endif %}
+       containers:
+         # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
+         # TODO(asaiacai): add ulimits
+         - name: konduktor-container
+           image: {{ image_id }}
+           # these are set during jobset definition, since we need to know the jobset
+           # name and number of nodes to set all the environment variables correctly here,
+           # as well as the additional ones from the job definition
+           env:
+             # flush logs immediately to stdout for more reactive log streaming
+             - name: PYTHONUNBUFFERED
+               value: "0"
+             - name: NODE_HOST_IPS
+               value: "{{ node_hostnames }}"
+             - name: MASTER_ADDR
+               value: "{{ master_addr }}"
+             - name: RANK
+               valueFrom:
+                 fieldRef:
+                   fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+             - name: NUM_NODES
+               value: "{{ num_nodes }}"
+             - name: NUM_GPUS_PER_NODE
+               value: "{{ num_gpus }}"
+             # these are for compatibility with skypilot
+             - name: SKYPILOT_NODE_IPS
+               value: "{{ node_hostnames }}"
+             - name: SKYPILOT_NODE_RANK
+               valueFrom:
+                 fieldRef:
+                   fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+             - name: SKYPILOT_NUM_NODES
+               value: "{{ num_nodes }}"
+             - name: SKYPILOT_NUM_GPUS_PER_NODE
+               value: "{{ num_gpus }}"
+           volumeMounts:
+             - name: shared-memory
+               mountPath: /dev/shm
+             {% for secret_type, secret_name in mount_secrets.items() %}
+             - name: {{ secret_type }}-secret
+               mountPath: /run/konduktor/{{ secret_type }}-secret
+             {% endfor %}
+           command: ["bash", "-c"]
+           # TODO(asaiacai): should we just mount this as a configmap instead? - Edit: probably not
+           args:
+             - |
+               # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
+               # Helper function to conditionally use sudo
+               set -eo pipefail
+               mkdir -p ~/.konduktor/tmp
+               start_epoch=$(date +%s);
+               start_setup=$(date +%s);
+               echo "===== KONDUKTOR: Running setup and installing packages ====="
+               prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
+               [ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
+
+
+               # Run apt update, install missing packages
+               DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
+                 $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
+               PACKAGES="rsync curl";
+               {% if 'gs' in mount_secrets %}
+               PACKAGES="$PACKAGES unzip wget";
+               {% endif %}
+               {% if 'git' in run_cmd %}
+               PACKAGES="$PACKAGES git";
+               {% endif %}
+
+               # Separate packages into two groups: packages that are installed first
+               # so that curl and rsync are available sooner to unblock the following
+               # conda installation and rsync.
+               INSTALL_FIRST="";
+               MISSING_PACKAGES="";
+               for pkg in $PACKAGES; do
+                 if ! dpkg -l | grep -q "^ii $pkg "; then
+                   if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
+                     INSTALL_FIRST="$INSTALL_FIRST $pkg";
+                   else
+                     MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
+                   fi
+                 fi
+               done;
+               if [ ! -z "$INSTALL_FIRST" ]; then
+                 $(prefix_cmd) echo "Installing core packages: $INSTALL_FIRST";
+                 DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST >> ~/.konduktor/tmp/apt-install.log;
+               fi;
+
+               if [ ! -z "$MISSING_PACKAGES" ]; then
+                 $(prefix_cmd) echo "Installing missing packages: $MISSING_PACKAGES";
+                 DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES >> ~/.konduktor/tmp/apt-install.log;
+               fi;
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
+
+               # unpack secrets credentials
+               $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials ====="
+               start_epoch=$(date +%s);
+               mkdir -p ~/.konduktor
+               mkdir -p {{ remote_workdir }}
+               {% for secret_type, secret_name in mount_secrets.items() %}
+               {% if secret_type == "gs" %}
+               $(prefix_cmd) echo "Unpacking GCP secret"
+               $(prefix_cmd) mkdir -p ~/.config
+               $(prefix_cmd) unzip /run/konduktor/gs-secret/gcpcredentials -d ~/.config/gcloud
+               {% endif %}
+               {% endfor %}
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="
+
+               # sync file mounts
+               {% for mkdir_command in mkdir_commands %}
+               $(prefix_cmd) {{ mkdir_command }}
+               {% endfor %}
+               {% if sync_commands|length > 0 %}
+               $(prefix_cmd) echo "===== KONDUKTOR: Syncing files ====="
+               start_epoch=$(date +%s);
+               {% for sync_command in sync_commands %}
+               $(prefix_cmd) {{ sync_command }} >> ~/.konduktor/tmp/sync-files.log
+               {% endfor %}
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Syncing files took $((end_epoch - start_epoch)) seconds ====="
+               {% endif %}
+               end_epoch=$(date +%s);
+               end_setup_time=$((end_epoch - start_setup));
+               $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
+               # run task
+               cd {{ remote_workdir }}
+               set +eo pipefail
+               $(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
+               start_epoch=$(date +%s);
+               {{ run_cmd | indent( width=14 ) }}
+               exit_code=$?
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Running task took $((end_epoch - start_epoch)) seconds and finished with exit code: $exit_code ====="
+               exit $exit_code
+           resources:
+             limits:
+               cpu: {{ cpu }}
+               memory: {{ memory }}Gi
+               # TODO(asaiacai): need to decide whether we include fabric configuration here
+               {% if num_gpus > 0 %}
+               nvidia.com/gpu: {{ num_gpus }}
+               {% endif %}
+             requests:
+               cpu: {{ cpu }}
+               memory: {{ memory }}Gi
+               {% if num_gpus > 0 %}
+               nvidia.com/gpu: {{ num_gpus }}
+               {% endif %}
+           securityContext:
+             capabilities:
+               add: ["IPC_LOCK"] # May be needed for memlock
+
+       volumes:
+         - name: shared-memory
+           emptyDir:
+             medium: "Memory"
+             sizeLimit: 4Gi
+         {% for secret_type, secret_name in mount_secrets.items() %}
+         - name: {{ secret_type }}-secret
+           secret:
+             secretName: {{ secret_name }}
+         {% endfor %}
+
+
+
+       # TODO(asaiacai): should we add nodeSelectors here or leave to
+       # kueue resource flavors. leaning towards defining
+       # in kueue and just querying for the kueue resource flavor
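Note how RANK and SKYPILOT_NODE_RANK are populated from the batch.kubernetes.io/job-completion-index annotation via the downward API. A sketch of how a multi-node task might consume these variables, assuming a PyTorch workload (the rendezvous port is hypothetical, and world_size here assumes one process per node):

    # Sketch of a task consuming the environment injected by pod.yaml.j2.
    import os

    import torch.distributed as dist

    rank = int(os.environ['RANK'])                 # job completion index
    num_nodes = int(os.environ['NUM_NODES'])
    gpus_per_node = int(os.environ['NUM_GPUS_PER_NODE'])
    master_addr = os.environ['MASTER_ADDR']

    dist.init_process_group(
        backend='nccl',
        init_method=f'tcp://{master_addr}:29500',  # port is hypothetical
        world_size=num_nodes,                      # one process per node assumed
        rank=rank,
    )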
konduktor/usage/__init__.py: File without changes
konduktor/usage/constants.py
@@ -0,0 +1,21 @@
+ """Constants for usage collection."""
+
+ import os
+
+ KONDUKTOR_DISABLE_USAGE_COLLECTION = os.environ.get(
+     'KONDUKTOR_DISABLE_USAGE_COLLECTION', False
+ )
+
+ POSTHOG_API_KEY = os.environ.get(
+     'POSTHOG_API_KEY', 'phc_4UgX80BfVNmYRZ2o3dJLyRMGkv1CxBozPAcPnD29uP4'
+ )
+
+ POSTHOG_HOST = os.environ.get('POSTHOG_HOST', 'https://us.i.posthog.com')
+
+ USAGE_POLICY_MESSAGE = (
+     'Konduktor collects usage data to improve its services. '
+     '`run` commands are not collected to '
+     'ensure privacy.\n'
+     'Usage logging can be disabled by setting the '
+     'environment variable KONDUKTOR_DISABLE_USAGE_COLLECTION=1.'
+ )
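One caveat worth noting: os.environ.get returns the raw string, so if the variable is set at all, the constant is truthy, even when set to '0' (assuming the flag is consumed in a boolean context, which matches the '=1' advice in USAGE_POLICY_MESSAGE):

    # Demonstrates the truthiness of the flag above.
    import os

    os.environ['KONDUKTOR_DISABLE_USAGE_COLLECTION'] = '0'
    flag = os.environ.get('KONDUKTOR_DISABLE_USAGE_COLLECTION', False)
    print(bool(flag))  # True -- any non-empty string disables collection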
konduktor/utils/__init__.py: File without changes
konduktor/utils/accelerator_registry.py
@@ -0,0 +1,21 @@
+ """Accelerator registry."""
+
+ _ACCELERATORS = [
+     'A100',
+     'A100-80GB',
+     'H100',
+ ]
+
+
+ def canonicalize_accelerator_name(accelerator: str) -> str:
+     """Returns the canonical accelerator name."""
+
+     # Common case: do not read the catalog files.
+     mapping = {name.lower(): name for name in _ACCELERATORS}
+     if accelerator.lower() in mapping:
+         return mapping[accelerator.lower()]
+
+     raise ValueError(
+         f'Accelerator name {accelerator!r} is not supported. '
+         f'Please choose one of {_ACCELERATORS}.'
+     )
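Usage is straightforward: matching is case-insensitive and names outside _ACCELERATORS raise ValueError:

    # Usage sketch for the registry above.
    from konduktor.utils.accelerator_registry import canonicalize_accelerator_name

    print(canonicalize_accelerator_name('h100'))       # 'H100'
    print(canonicalize_accelerator_name('a100-80gb'))  # 'A100-80GB'

    try:
        canonicalize_accelerator_name('TPU-v4')
    except ValueError as e:
        print(e)  # not in _ACCELERATORS, so this raises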
konduktor/utils/annotations.py
@@ -0,0 +1,62 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Annotations for public APIs."""
+
+ import functools
+ from typing import Callable, Literal
+
+ # Whether the current process is a SkyPilot API server process.
+ is_on_api_server = True
+ FUNCTIONS_NEED_RELOAD_CACHE = []
+
+
+ def client_api(func):
+     """Mark a function as a client-side API.
+
+     Code invoked by server-side functions will find annotations.is_on_api_server
+     to be True, so they can have some server-side handling.
+     """
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         global is_on_api_server
+         is_on_api_server = False
+         return func(*args, **kwargs)
+
+     return wrapper
+
+
+ def lru_cache(
+     scope: Literal['global', 'request'], *lru_cache_args, **lru_cache_kwargs
+ ) -> Callable:
+     """LRU cache decorator for functions.
+
+     This decorator allows us to track which functions need to be reloaded for a
+     new request using the scope argument.
+
+     Args:
+         scope: Whether the cache is global or request-specific, i.e. needs to be
+             reloaded for a new request.
+         lru_cache_args: Arguments for functools.lru_cache.
+         lru_cache_kwargs: Keyword arguments for functools.lru_cache.
+     """
+
+     def decorator(func: Callable) -> Callable:
+         if scope == 'global':
+             return functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)(func)
+         else:
+             cached_func = functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)(func)
+             FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
+             return cached_func
+
+     return decorator
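A usage sketch for the two scopes. The reset loop at the bottom is an assumption about how a server might use FUNCTIONS_NEED_RELOAD_CACHE between requests; the module itself only records the cached functions:

    # Sketch of the two cache scopes provided by annotations.lru_cache.
    from konduktor.utils import annotations

    @annotations.lru_cache(scope='global', maxsize=1)
    def detect_clouds():
        ...  # cached once, survives across requests

    @annotations.lru_cache(scope='request', maxsize=1)
    def current_user_config():
        ...  # recorded in FUNCTIONS_NEED_RELOAD_CACHE

    # Hypothetical per-request reset: drop every request-scoped cache.
    for fn in annotations.FUNCTIONS_NEED_RELOAD_CACHE:
        fn.cache_clear()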
konduktor/utils/base64_utils.py
@@ -0,0 +1,93 @@
+ """
+ Utilities for zipping/unzipping and base64 encoding/decoding k8s secrets.
+ """
+
+ import base64
+ import os
+ import shutil
+ import tempfile
+ import zipfile
+ from typing import List
+
+
+ def zip_base64encode(files: List[str]) -> str:
+     """Zips files and encodes them in base64.
+
+     Args:
+         files: List of file paths to zip. Can include files and directories.
+
+     Returns:
+         Base64 encoded string of the zipped files.
+     """
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Copy all files/directories to temp dir preserving structure
+         for file_path in files:
+             src_path = os.path.expanduser(file_path)
+             if not os.path.exists(src_path):
+                 continue
+             dst_path = os.path.join(temp_dir, os.path.basename(file_path))
+
+             if os.path.isdir(src_path):
+                 shutil.copytree(src_path, dst_path)
+             else:
+                 shutil.copy2(src_path, dst_path)
+
+         # Create zip file
+         zip_path = os.path.join(temp_dir, 'archive.zip')
+         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+             for item in os.listdir(temp_dir):
+                 if item == 'archive.zip':
+                     continue
+                 item_path = os.path.join(temp_dir, item)
+                 if os.path.isfile(item_path):
+                     zipf.write(item_path, item)
+                 else:
+                     for root, _, filenames in os.walk(item_path):
+                         for filename in filenames:
+                             file_path = os.path.join(root, filename)
+                             arcname = os.path.relpath(file_path, temp_dir)
+                             zipf.write(file_path, arcname)
+
+         # Read and encode zip file
+         with open(zip_path, 'rb') as f:
+             zip_str = f.read()
+         secret_value = base64.b64encode(zip_str).decode('utf-8')
+         return secret_value
+
+
+ def base64decode_unzip(secret_value: str, output_path: str) -> str:
+     """Decodes a base64 encoded string and unzips the files.
+
+     Args:
+         secret_value: Base64 encoded string of the zipped files.
+         output_path: Path where to extract the unzipped files.
+
+     Returns:
+         Path to the unzipped files.
+     """
+     # TODO(asaiacai): this is messy, I know...
+     # Decode base64 string
+     decoded_data = base64.b64decode(secret_value)
+
+     # Write decoded data to temporary zip file
+     with tempfile.TemporaryDirectory() as temp_dir:
+         zip_path = os.path.join(temp_dir, 'archive.zip')
+
+         with zipfile.ZipFile(zip_path, 'w') as zipf:
+             zipf.writestr('data.zip', decoded_data)
+
+         with zipfile.ZipFile(zip_path, 'r') as zipf:
+             zipf.extractall(path=output_path)
+
+         with zipfile.ZipFile(os.path.join(output_path, 'data.zip'), 'r') as zipf:
+             zipf.extractall(path=output_path)
+
+     return output_path
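A round-trip sketch for these helpers; the paths are hypothetical. Note that the decode side deliberately leaves the intermediate data.zip next to the extracted files (the TODO above acknowledges the extra wrapping):

    # Round-trip sketch for zip_base64encode / base64decode_unzip.
    import os
    from konduktor.utils.base64_utils import base64decode_unzip, zip_base64encode

    # Pack a credentials directory into a base64 string, e.g. for a k8s Secret.
    encoded = zip_base64encode([os.path.expanduser('~/.config/gcloud')])

    # On the consuming side, decode and extract into a target directory.
    extracted = base64decode_unzip(encoded, '/tmp/gcloud-creds')
    print(os.listdir(extracted))  # original files plus the intermediate data.zip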