konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/templates/jobset.yaml.j2
@@ -0,0 +1,31 @@
+ jobset:
+   apiVersion: jobset.x-k8s.io/v1alpha2
+   kind: JobSet
+   metadata:
+     name: {{ job_name }}
+     labels:
+       {{ jobset_name_label }}: "{{ job_name }}"
+       {{ jobset_userid_label }}: "{{ user_id }}"
+       {{ jobset_user_label }}: "{{ user }}"
+       {% if accelerator_type %}
+       {{ jobset_accelerator_label }}: "{{ accelerator_type }}"
+       {{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
+       {% endif %}
+       trainy.ai/konduktor-managed: "true"
+       parent: "trainy"
+   spec:
+     ttlSecondsAfterFinished: 259200 # 3 days
+     replicatedJobs:
+       - name: workers
+         template:
+           spec:
+             ttlSecondsAfterFinished: 600 # 10 minutes
+             parallelism: {{ num_nodes }}
+             completions: {{ num_nodes }}
+             backoffLimit: 0
+             template: {}
+             podFailurePolicy:
+               rules:
+                 - action: FailJob
+                   onPodConditions:
+                     - type: ConfigIssue
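For orientation, here is a minimal sketch of rendering this template with Jinja2. All values, and in particular the label keys, are hypothetical stand-ins; the real rendering happens inside konduktor's jobset backend (e.g. konduktor/backends/jobset_utils.py), with label names supplied by the package:

    # Hypothetical rendering of jobset.yaml.j2; all values below are made up.
    import jinja2
    import yaml

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader('konduktor/templates'))
    template = env.get_template('jobset.yaml.j2')

    rendered = template.render(
        job_name='demo-job',                              # hypothetical
        user='alice', user_id='1234',                     # hypothetical
        jobset_name_label='trainy.ai/job-name',           # hypothetical label keys
        jobset_userid_label='trainy.ai/user-id',
        jobset_user_label='trainy.ai/user',
        accelerator_type='H100',
        jobset_accelerator_label='trainy.ai/accelerator',
        jobset_num_accelerators_label='trainy.ai/num-accelerators',
        num_accelerators=8,
        num_nodes=2,
    )
    jobset = yaml.safe_load(rendered)['jobset']
    job_spec = jobset['spec']['replicatedJobs'][0]['template']['spec']
    print(job_spec['parallelism'])  # -> 2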
konduktor/templates/pod.yaml.j2
@@ -0,0 +1,185 @@
+ kubernetes:
+   pod_config:
+     metadata:
+       {% if accelerator_type %}
+       labels:
+         parent: trainy
+         trainy.ai/accelerator: {{ accelerator_type }}
+       {% endif %}
+     spec:
+       restartPolicy: "Never"
+       # trigger this on GPU request
+       {% if num_gpus > 0 %}
+       tolerations:
+         - key: "nvidia.com/gpu"
+           operator: "Exists"
+       {% endif %}
+       containers:
+         # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
+         # TODO(asaiacai): add ulimits
+         - name: konduktor-container
+           image: {{ image_id }}
+           # these are set during jobset definition, since we need to know the jobset
+           # name and number of nodes to set all the environment variables correctly here,
+           # as well as the additional ones from the job definition
+           env:
+             # flush logs immediately to stdout for more reactive log streaming
+             - name: PYTHONUNBUFFERED
+               value: "0"
+             - name: NODE_HOST_IPS
+               value: "{{ node_hostnames }}"
+             - name: MASTER_ADDR
+               value: "{{ master_addr }}"
+             - name: RANK
+               valueFrom:
+                 fieldRef:
+                   fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+             - name: NUM_NODES
+               value: "{{ num_nodes }}"
+             - name: NUM_GPUS_PER_NODE
+               value: "{{ num_gpus }}"
+             # these are for compatibility with skypilot
+             - name: SKYPILOT_NODE_IPS
+               value: "{{ node_hostnames }}"
+             - name: SKYPILOT_NODE_RANK
+               valueFrom:
+                 fieldRef:
+                   fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+             - name: SKYPILOT_NUM_NODES
+               value: "{{ num_nodes }}"
+             - name: SKYPILOT_NUM_GPUS_PER_NODE
+               value: "{{ num_gpus }}"
+           volumeMounts:
+             - name: shared-memory
+               mountPath: /dev/shm
+             {% for secret_type, secret_name in mount_secrets.items() %}
+             - name: {{ secret_type }}-secret
+               mountPath: /run/konduktor/{{ secret_type }}-secret
+             {% endfor %}
+           command: ["bash", "-c"]
+           # TODO(asaiacai): should we just mount this as a configmap instead? - Edit: probably not
+           args:
+             - |
+               # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
+               # Helper function to conditionally use sudo
+               set -eo pipefail
+               mkdir -p ~/.konduktor/tmp
+               start_epoch=$(date +%s);
+               start_setup=$(date +%s);
+               echo "===== KONDUKTOR: Running setup and installing packages ====="
+               prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
+               [ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
+
+
+               # Run apt update, install missing packages
+               DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
+                 $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
+               PACKAGES="rsync curl";
+               {% if 'gs' in mount_secrets %}
+               PACKAGES="$PACKAGES unzip wget";
+               {% endif %}
+               {% if 'git' in run_cmd %}
+               PACKAGES="$PACKAGES git";
+               {% endif %}
+
+               # Separate packages into two groups: packages that are installed first
+               # so that curl and rsync are available sooner to unblock the following
+               # conda installation and rsync.
+               INSTALL_FIRST="";
+               MISSING_PACKAGES="";
+               for pkg in $PACKAGES; do
+                 if ! dpkg -l | grep -q "^ii $pkg "; then
+                   if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
+                     INSTALL_FIRST="$INSTALL_FIRST $pkg";
+                   else
+                     MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
+                   fi
+                 fi
+               done;
+               if [ ! -z "$INSTALL_FIRST" ]; then
+                 $(prefix_cmd) echo "Installing core packages: $INSTALL_FIRST";
+                 DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST >> ~/.konduktor/tmp/apt-install.log;
+               fi;
+
+               if [ ! -z "$MISSING_PACKAGES" ]; then
+                 $(prefix_cmd) echo "Installing missing packages: $MISSING_PACKAGES";
+                 DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES >> ~/.konduktor/tmp/apt-install.log;
+               fi;
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
+
+               # unpack secrets credentials
+               $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials ====="
+               start_epoch=$(date +%s);
+               mkdir -p ~/.konduktor
+               mkdir -p {{ remote_workdir }}
+               {% for secret_type, secret_name in mount_secrets.items() %}
+               {% if secret_type == "gs" %}
+               $(prefix_cmd) echo "Unpacking GCP secret"
+               $(prefix_cmd) mkdir -p ~/.config
+               $(prefix_cmd) unzip /run/konduktor/gs-secret/gcpcredentials -d ~/.config/gcloud
+               {% endif %}
+               {% endfor %}
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="
+
+               # sync file mounts
+               {% for mkdir_command in mkdir_commands %}
+               $(prefix_cmd) {{ mkdir_command }}
+               {% endfor %}
+               {% if sync_commands|length > 0 %}
+               $(prefix_cmd) echo "===== KONDUKTOR: Syncing files ====="
+               start_epoch=$(date +%s);
+               {% for sync_command in sync_commands %}
+               $(prefix_cmd) {{ sync_command }} >> ~/.konduktor/tmp/sync-files.log
+               {% endfor %}
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Syncing files took $((end_epoch - start_epoch)) seconds ====="
+               {% endif %}
+               end_epoch=$(date +%s);
+               end_setup_time=$((end_epoch - start_setup));
+               $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
+               # run task
+               cd {{ remote_workdir }}
+               set +eo pipefail
+               $(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
+               start_epoch=$(date +%s);
+               {{ run_cmd | indent( width=14 ) }}
+               exit_code=$?
+               end_epoch=$(date +%s);
+               $(prefix_cmd) echo "===== KONDUKTOR: Running task took $((end_epoch - start_epoch)) seconds and finished with exit code: $exit_code ====="
+               exit $exit_code
+           resources:
+             limits:
+               cpu: {{ cpu }}
+               memory: {{ memory }}Gi
+               # TODO(asaiacai): need to decide whether we include fabric configuration here
+               {% if num_gpus > 0 %}
+               nvidia.com/gpu: {{ num_gpus }}
+               {% endif %}
+             requests:
+               cpu: {{ cpu }}
+               memory: {{ memory }}Gi
+               {% if num_gpus > 0 %}
+               nvidia.com/gpu: {{ num_gpus }}
+               {% endif %}
+           securityContext:
+             capabilities:
+               add: ["IPC_LOCK"] # May be needed for memlock
+
+       volumes:
+         - name: shared-memory
+           emptyDir:
+             medium: "Memory"
+             sizeLimit: 4Gi
+         {% for secret_type, secret_name in mount_secrets.items() %}
+         - name: {{ secret_type }}-secret
+           secret:
+             secretName: {{ secret_name }}
+         {% endfor %}
+
+
+
+       # TODO(asaiacai): should we add nodeSelectors here or leave to
+       # kueue resource flavors. leaning towards defining
+       # in kueue and just querying for the kueue resource flavor
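Note how RANK and SKYPILOT_NODE_RANK are populated from the batch.kubernetes.io/job-completion-index annotation via the downward API. A sketch of how a multi-node task might consume these variables, assuming a PyTorch workload (the rendezvous port is hypothetical, and world_size here assumes one process per node):

    # Sketch of a task consuming the environment injected by pod.yaml.j2.
    import os

    import torch.distributed as dist

    rank = int(os.environ['RANK'])                 # job completion index
    num_nodes = int(os.environ['NUM_NODES'])
    gpus_per_node = int(os.environ['NUM_GPUS_PER_NODE'])
    master_addr = os.environ['MASTER_ADDR']

    dist.init_process_group(
        backend='nccl',
        init_method=f'tcp://{master_addr}:29500',  # port is hypothetical
        world_size=num_nodes,                      # one process per node assumed
        rank=rank,
    )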
konduktor/usage/__init__.py: File without changes
konduktor/usage/constants.py
@@ -0,0 +1,21 @@
+ """Constants for usage collection."""
+
+ import os
+
+ KONDUKTOR_DISABLE_USAGE_COLLECTION = os.environ.get(
+     'KONDUKTOR_DISABLE_USAGE_COLLECTION', False
+ )
+
+ POSTHOG_API_KEY = os.environ.get(
+     'POSTHOG_API_KEY', 'phc_4UgX80BfVNmYRZ2o3dJLyRMGkv1CxBozPAcPnD29uP4'
+ )
+
+ POSTHOG_HOST = os.environ.get('POSTHOG_HOST', 'https://us.i.posthog.com')
+
+ USAGE_POLICY_MESSAGE = (
+     'Konduktor collects usage data to improve its services. '
+     '`run` commands are not collected to '
+     'ensure privacy.\n'
+     'Usage logging can be disabled by setting the '
+     'environment variable KONDUKTOR_DISABLE_USAGE_COLLECTION=1.'
+ )
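One caveat worth noting: os.environ.get returns the raw string, so if the variable is set at all, the constant is truthy, even when set to '0' (assuming the flag is consumed in a boolean context, which matches the '=1' advice in USAGE_POLICY_MESSAGE):

    # Demonstrates the truthiness of the flag above.
    import os

    os.environ['KONDUKTOR_DISABLE_USAGE_COLLECTION'] = '0'
    flag = os.environ.get('KONDUKTOR_DISABLE_USAGE_COLLECTION', False)
    print(bool(flag))  # True -- any non-empty string disables collection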
konduktor/utils/__init__.py: File without changes
konduktor/utils/accelerator_registry.py
@@ -0,0 +1,21 @@
+ """Accelerator registry."""
+
+ _ACCELERATORS = [
+     'A100',
+     'A100-80GB',
+     'H100',
+ ]
+
+
+ def canonicalize_accelerator_name(accelerator: str) -> str:
+     """Returns the canonical accelerator name."""
+
+     # Common case: do not read the catalog files.
+     mapping = {name.lower(): name for name in _ACCELERATORS}
+     if accelerator.lower() in mapping:
+         return mapping[accelerator.lower()]
+
+     raise ValueError(
+         f'Accelerator name {accelerator!r} is not supported. '
+         f'Please choose one of {_ACCELERATORS}.'
+     )
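Usage is straightforward: matching is case-insensitive and names outside _ACCELERATORS raise ValueError:

    # Usage sketch for the registry above.
    from konduktor.utils.accelerator_registry import canonicalize_accelerator_name

    print(canonicalize_accelerator_name('h100'))       # 'H100'
    print(canonicalize_accelerator_name('a100-80gb'))  # 'A100-80GB'

    try:
        canonicalize_accelerator_name('TPU-v4')
    except ValueError as e:
        print(e)  # not in _ACCELERATORS, so this raises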
konduktor/utils/annotations.py
@@ -0,0 +1,62 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Annotations for public APIs."""
+
+ import functools
+ from typing import Callable, Literal
+
+ # Whether the current process is a SkyPilot API server process.
+ is_on_api_server = True
+ FUNCTIONS_NEED_RELOAD_CACHE = []
+
+
+ def client_api(func):
+     """Mark a function as a client-side API.
+
+     Code invoked by server-side functions will find annotations.is_on_api_server
+     to be True, so they can have some server-side handling.
+     """
+
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         global is_on_api_server
+         is_on_api_server = False
+         return func(*args, **kwargs)
+
+     return wrapper
+
+
+ def lru_cache(
+     scope: Literal['global', 'request'], *lru_cache_args, **lru_cache_kwargs
+ ) -> Callable:
+     """LRU cache decorator for functions.
+
+     This decorator allows us to track which functions need to be reloaded for a
+     new request using the scope argument.
+
+     Args:
+         scope: Whether the cache is global or request-specific, i.e. needs to be
+             reloaded for a new request.
+         lru_cache_args: Arguments for functools.lru_cache.
+         lru_cache_kwargs: Keyword arguments for functools.lru_cache.
+     """
+
+     def decorator(func: Callable) -> Callable:
+         if scope == 'global':
+             return functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)(func)
+         else:
+             cached_func = functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)(func)
+             FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
+             return cached_func
+
+     return decorator
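A usage sketch for the two scopes. The reset loop at the bottom is an assumption about how a server might use FUNCTIONS_NEED_RELOAD_CACHE between requests; the module itself only records the cached functions:

    # Sketch of the two cache scopes provided by annotations.lru_cache.
    from konduktor.utils import annotations

    @annotations.lru_cache(scope='global', maxsize=1)
    def detect_clouds():
        ...  # cached once, survives across requests

    @annotations.lru_cache(scope='request', maxsize=1)
    def current_user_config():
        ...  # recorded in FUNCTIONS_NEED_RELOAD_CACHE

    # Hypothetical per-request reset: drop every request-scoped cache.
    for fn in annotations.FUNCTIONS_NEED_RELOAD_CACHE:
        fn.cache_clear()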
konduktor/utils/base64_utils.py
@@ -0,0 +1,93 @@
+ """
+ Utilities for zipping/unzipping and base64 encoding/decoding k8s secrets.
+ """
+
+ import base64
+ import os
+ import shutil
+ import tempfile
+ import zipfile
+ from typing import List
+
+
+ def zip_base64encode(files: List[str]) -> str:
+     """Zips files and encodes them in base64.
+
+     Args:
+         files: List of file paths to zip. Can include files and directories.
+
+     Returns:
+         Base64 encoded string of the zipped files.
+     """
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Copy all files/directories to temp dir preserving structure
+         for file_path in files:
+             src_path = os.path.expanduser(file_path)
+             if not os.path.exists(src_path):
+                 continue
+             dst_path = os.path.join(temp_dir, os.path.basename(file_path))
+
+             if os.path.isdir(src_path):
+                 shutil.copytree(src_path, dst_path)
+             else:
+                 shutil.copy2(src_path, dst_path)
+
+         # Create zip file
+         zip_path = os.path.join(temp_dir, 'archive.zip')
+         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+             for item in os.listdir(temp_dir):
+                 if item == 'archive.zip':
+                     continue
+                 item_path = os.path.join(temp_dir, item)
+                 if os.path.isfile(item_path):
+                     zipf.write(item_path, item)
+                 else:
+                     for root, _, filenames in os.walk(item_path):
+                         for filename in filenames:
+                             file_path = os.path.join(root, filename)
+                             arcname = os.path.relpath(file_path, temp_dir)
+                             zipf.write(file_path, arcname)
+
+         # Read and encode zip file
+         with open(zip_path, 'rb') as f:
+             zip_str = f.read()
+         secret_value = base64.b64encode(zip_str).decode('utf-8')
+         return secret_value
+
+
+ def base64decode_unzip(secret_value: str, output_path: str) -> str:
+     """Decodes a base64 encoded string and unzips the files.
+
+     Args:
+         secret_value: Base64 encoded string of the zipped files.
+         output_path: Path where to extract the unzipped files.
+
+     Returns:
+         Path to the unzipped files.
+     """
+     # TODO(asaiacai): this is messy, I know...
+     # Decode base64 string
+     decoded_data = base64.b64decode(secret_value)
+
+     # Write decoded data to temporary zip file
+     with tempfile.TemporaryDirectory() as temp_dir:
+         zip_path = os.path.join(temp_dir, 'archive.zip')
+
+         with zipfile.ZipFile(zip_path, 'w') as zipf:
+             zipf.writestr('data.zip', decoded_data)
+
+         with zipfile.ZipFile(zip_path, 'r') as zipf:
+             zipf.extractall(path=output_path)
+
+         with zipfile.ZipFile(os.path.join(output_path, 'data.zip'), 'r') as zipf:
+             zipf.extractall(path=output_path)
+
+     return output_path
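A round-trip sketch for these helpers; the paths are hypothetical. Note that the decode side deliberately leaves the intermediate data.zip next to the extracted files (the TODO above acknowledges the extra wrapping):

    # Round-trip sketch for zip_base64encode / base64decode_unzip.
    import os
    from konduktor.utils.base64_utils import base64decode_unzip, zip_base64encode

    # Pack a credentials directory into a base64 string, e.g. for a k8s Secret.
    encoded = zip_base64encode([os.path.expanduser('~/.config/gcloud')])

    # On the consuming side, decode and extract into a target directory.
    extracted = base64decode_unzip(encoded, '/tmp/gcloud-creds')
    print(os.listdir(extracted))  # original files plus the intermediate data.zip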