konduktor-nightly 0.1.0.dev20250519104943__py3-none-any.whl → 0.1.0.dev20250521104900__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '709675de56c5bb11cf4131c48e0fdfc96c67decf'
17
+ _KONDUKTOR_COMMIT_SHA = '46626c8c0df4bbbd5a8fc164b2d2f66c26dbdd33'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250519104943'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250521104900'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
konduktor/templates/pod.yaml.j2 CHANGED
@@ -15,36 +15,6 @@ kubernetes:
15
15
  - key: "nvidia.com/gpu"
16
16
  operator: "Exists"
17
17
  {% endif %}
18
- initContainers:
19
- - name: setup-synchronizer
20
- image: "alpine:3.19"
21
- restartPolicy: Always
22
- command: ["/bin/sh", "-c"]
23
- args:
24
- - |
25
- apk add --no-cache socat
26
- wget https://raw.githubusercontent.com/asaiacai/dumb_barrier/refs/heads/main/dumb_barrier.sh
27
- sh -x dumb_barrier.sh
28
- volumeMounts:
29
- - name: sync
30
- mountPath: /tmp/konduktor
31
- env:
32
- - name: MASTER_ADDR
33
- value: "{{ master_addr }}"
34
- - name: RANK
35
- valueFrom:
36
- fieldRef:
37
- fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
38
- - name: WORLD_SIZE
39
- value: "{{ num_nodes }}"
40
- - name: MASTER_PORT
41
- value: "11111"
42
- - name: GO_PORT
43
- value: "11112"
44
- - name: POD_NAMESPACE
45
- valueFrom:
46
- fieldRef:
47
- fieldPath: metadata.namespace
48
18
  containers:
49
19
  # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
50
20
  - name: konduktor-container
@@ -327,33 +297,6 @@ kubernetes:
327
297
  ulimit -Sc 0 && ulimit -Hc 0
328
298
  $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
329
299
  set +eo pipefail
330
- $(prefix_cmd) cd {{ remote_workdir }}
331
- {% if setup_cmd %}
332
- # setup task
333
- $(prefix_cmd) echo "===== KONDUKTOR: Running setup ======="
334
- {{ setup_cmd | indent( width=14 ) }}
335
- {% endif %}
336
-
337
- # synchronize workers before executing `run`
338
- set -e
339
- touch "/tmp/konduktor/SETUP"
340
- # TODO(asaiacai): should we make this value tuneable for users?
341
- TIMEOUT=300
342
- start_sync=$(date +%s);
343
- DEADLINE=$(( $(date +%s) + TIMEOUT ))
344
-
345
- echo "[KONDUKTOR: main] Waiting for workers to synchronize"
346
- while [ ! -f "/tmp/konduktor/READY" ]; do
347
- if [ "$(date +%s)" -ge "$DEADLINE" ]; then
348
- echo "[KONDUKTOR: main] ERROR: Timed out after 2 minutes of waiting for worker synchronization"
349
- exit 1
350
- fi
351
- sleep 0.5
352
- done
353
- echo "[KONDUKTOR: main] All workers have joined"
354
- end_sync=$(date +%s);
355
- echo "[KONDUKTOR: main] Synchronization took $((end_sync - start_sync)) seconds"
356
- set +eo pipefail
357
300
  # run task
358
301
  $(prefix_cmd) cd {{ remote_workdir }}
359
302
  $(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
@@ -395,8 +338,6 @@ kubernetes:
395
338
  secret:
396
339
  secretName: {{ secret_name }}
397
340
  {% endfor %}
398
-
399
-
400
341
 
401
342
  # TODO(asaiacai): should we add nodeSelectors here or leave to
402
343
  # kueue resource flavors. leaning towards defining
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250519104943
3
+ Version: 0.1.0.dev20250521104900
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=R1b8oEBavaV7tqJx9KqKstRMivzsCoXfSblRpMwSmc4,1540
1
+ konduktor/__init__.py,sha256=BpU070cC1XqK9i4AWlWvYixgyNGfMsI40UCLdAsI8us,1540
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -71,7 +71,7 @@ konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw
71
71
  konduktor/resource.py,sha256=w2PdIrmQaJWA-GLSmVBcg4lxwuxvPulz35_YSKa5o24,19254
72
72
  konduktor/task.py,sha256=ofwd8WIhfD6C3ThLcv6X3GUzQHyZ6ddjUagE-umF4K0,35207
73
73
  konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
74
- konduktor/templates/pod.yaml.j2,sha256=TNwVnadENnLmYa753yNlvyjmfkibHviaOu3OkCmz6mo,17778
74
+ konduktor/templates/pod.yaml.j2,sha256=v0s_gLmr7bBMVtdiElHbBvt36bJzAxBptqWVLzvdvE4,15520
75
75
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
76
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
77
77
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -91,8 +91,8 @@ konduktor/utils/schemas.py,sha256=2fHsTi3t9q3LXqOPrcpkmPsMbaoJBnuJstd6ULmDiUo,16
91
91
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
92
92
  konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
93
93
  konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
94
- konduktor_nightly-0.1.0.dev20250519104943.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
- konduktor_nightly-0.1.0.dev20250519104943.dist-info/METADATA,sha256=8K22_WDfcgSJrA_5XvaXrjj6PzsAT0cKetHf1jgTTMA,4366
96
- konduktor_nightly-0.1.0.dev20250519104943.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
- konduktor_nightly-0.1.0.dev20250519104943.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
- konduktor_nightly-0.1.0.dev20250519104943.dist-info/RECORD,,
94
+ konduktor_nightly-0.1.0.dev20250521104900.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
+ konduktor_nightly-0.1.0.dev20250521104900.dist-info/METADATA,sha256=Zy1uJzy9wocXmmbkhqofIoykrQ_Re5xwCHvnj6jxJGM,4366
96
+ konduktor_nightly-0.1.0.dev20250521104900.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
+ konduktor_nightly-0.1.0.dev20250521104900.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
+ konduktor_nightly-0.1.0.dev20250521104900.dist-info/RECORD,,