konduktor-nightly 0.1.0.dev20250716105229__py3-none-any.whl → 0.1.0.dev20250718105306__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = 'd7075fa88a909a1b4ccf58487d0de364c2a74e7d'
17
+ _KONDUKTOR_COMMIT_SHA = '84818710a16e0a0515fbbd7878395fca37cf94f7'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250716105229'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250718105306'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -0,0 +1 @@
1
+ KONDUKTOR_SSH_PORT = 2222
@@ -18,6 +18,7 @@ if typing.TYPE_CHECKING:
18
18
 
19
19
  import konduktor
20
20
  from konduktor import authentication, config, constants, kube_client, logging
21
+ from konduktor.backends import constants as backend_constants
21
22
  from konduktor.data import registry
22
23
  from konduktor.utils import (
23
24
  common_utils,
@@ -171,7 +172,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
171
172
  f'though specified by `tailscale.secret_name`: {err}'
172
173
  )
173
174
 
174
- enable_ssh = config.get_nested(('ssh', 'enable'), False)
175
+ enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
175
176
  secret_name = None
176
177
  if enable_ssh:
177
178
  private_key_path, public_key_path = authentication.get_or_generate_keys()
@@ -257,6 +258,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
257
258
  # SSH
258
259
  'enable_ssh': enable_ssh,
259
260
  'secret_name': secret_name,
261
+ 'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
260
262
  # Kinds of Secrets
261
263
  # --kind git-ssh
262
264
  'git_ssh': git_ssh_secret_name,
@@ -290,6 +292,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
290
292
  pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
291
293
  env_map.values()
292
294
  )
295
+ logger.debug(f'rendered pod spec: \n\t{pod_config}')
293
296
 
294
297
  # validate pod spec using json schema
295
298
  try:
@@ -20,7 +20,8 @@ kubernetes:
20
20
  - name: konduktor-container
21
21
  {% if enable_ssh %}
22
22
  ports:
23
- - containerPort: 2222
23
+ - name: ssh
24
+ containerPort: {{ konduktor_ssh_port }}
24
25
  {% endif %}
25
26
  image: {{ image_id }}
26
27
  # this is set during jobset definition since we need to know the jobset
@@ -38,6 +39,10 @@ kubernetes:
38
39
  valueFrom:
39
40
  fieldRef:
40
41
  fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
42
+ - name: LOCAL_ADDR
43
+ valueFrom:
44
+ fieldRef:
45
+ fieldPath: status.podIP
41
46
  - name: NUM_NODES
42
47
  value: "{{ num_nodes }}"
43
48
  - name: NUM_GPUS_PER_NODE
@@ -71,6 +76,8 @@ kubernetes:
71
76
  secretKeyRef:
72
77
  name: {{ secret_name }}
73
78
  key: PRIVKEY
79
+ - name: KONDUKTOR_SSH_PORT
80
+ value: "{{ konduktor_ssh_port }}"
74
81
  {% endif %}
75
82
  {% if git_ssh %}
76
83
  - name: GIT_SSH_COMMAND
@@ -113,7 +120,8 @@ kubernetes:
113
120
  - |
114
121
  # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
115
122
  # Helper function to conditionally use sudo
116
- # set -eo pipefail
123
+ export RDZV_CONF=is_host=$(if [ "$RANK" == "0" ]; then echo "true"; else echo "false"; fi)
124
+ set -eo pipefail
117
125
  {% if konduktor_debug %}
118
126
  set -x
119
127
  {% endif %}
@@ -170,7 +178,7 @@ kubernetes:
170
178
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
171
179
  fi;
172
180
  end_epoch=$(date +%s);
173
-
181
+
174
182
  echo "Exposing ENV variables"
175
183
  $(prefix_cmd) env -0 | awk -v RS='\0' '
176
184
  {
@@ -188,6 +196,7 @@ kubernetes:
188
196
  function InstallSSH {
189
197
  export DEBIAN_FRONTEND=noninteractive
190
198
  export TZ=Etc/UTC
199
+ set -u
191
200
  if service sshd status > /dev/null 2>&1; then
192
201
  $(prefix_cmd) echo "OpenSSH server is already started."
193
202
  return
@@ -245,14 +254,15 @@ kubernetes:
245
254
  # turn off PAM to fix sshd login issue
246
255
  $(prefix_cmd) sed -i 's/UsePAM yes/UsePAM no/' /etc/ssh/sshd_config
247
256
 
248
- # set default port to 2222
249
- $(prefix_cmd) sed -i 's/#Port 22/Port 2222/' /etc/ssh/sshd_config
257
+ # set the sshd listening port to the configured konduktor_ssh_port (replaces the commented-out default "#Port 22")
258
+ $(prefix_cmd) sed -i 's/#Port 22/Port {{ konduktor_ssh_port }}/' /etc/ssh/sshd_config
250
259
 
251
260
  $(prefix_cmd) mkdir /run/sshd
252
261
  $(prefix_cmd) chmod 0755 /run/sshd
253
262
 
254
263
  $(prefix_cmd) service ssh start
255
264
  $(prefix_cmd) echo "sshd service started"
265
+ set +u
256
266
  }
257
267
 
258
268
  InstallSSH
@@ -266,7 +276,7 @@ kubernetes:
266
276
  if ! command -v tailscale >/dev/null 2>&1; then
267
277
  $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
268
278
  fi
269
- $(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &
279
+ $(prefix_cmd) tailscaled --tun=userspace-networking --state=mem: >/dev/null 2>&1 &
270
280
  $(prefix_cmd) timeout 5 tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME}
271
281
  $(prefix_cmd) sleep 10
272
282
  done
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250716105229
3
+ Version: 0.1.0.dev20250718105306
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=zYoBNEW7JjgSjqIRFfaHeNgMTT2K8KGedqy-SUCRYa8,1540
1
+ konduktor/__init__.py,sha256=XHQ2EP7giHWRyU98p0JIMey3Wv9AFufniNYul3Cjm_k,1540
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -6,8 +6,9 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
6
6
  konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
7
7
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
8
8
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
9
+ konduktor/backends/constants.py,sha256=nU_cd4x8V2GwP9-oGlcIwjt5snnyhmOlxXbXRZ8d6Fc,26
9
10
  konduktor/backends/jobset.py,sha256=UdhwAuZODLMbLY51Y2zOBsh6wg4Pb84oHVvUKzx3Z2w,8434
10
- konduktor/backends/jobset_utils.py,sha256=sE3USQ15rcXeTCmkysN5YYugiYcaqhWS65vdJRZvlJA,22827
11
+ konduktor/backends/jobset_utils.py,sha256=zOxXikz5fRsciTIdEgK3mQNxJfsRVEiW1K6FQI5ZD3Y,23042
11
12
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
12
13
  konduktor/cli.py,sha256=GXmm4DGLHuvfnqDG_3PsWXK7mOI3XWvlC3VeI6JDoqI,36004
13
14
  konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
@@ -71,7 +72,7 @@ konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw
71
72
  konduktor/resource.py,sha256=nHgPWXCbWj5sWyslNngrFypMN1K0Dksb0yHbJqWaei8,19612
72
73
  konduktor/task.py,sha256=ofwd8WIhfD6C3ThLcv6X3GUzQHyZ6ddjUagE-umF4K0,35207
73
74
  konduktor/templates/jobset.yaml.j2,sha256=rdURknodtgLp4zoA2PX86Nn4wPpi3tr5l4IG55aWBRg,1059
74
- konduktor/templates/pod.yaml.j2,sha256=PagIrQnON2L32m6A1tdFnO2ieF2Lzggm8AwISWQB-Kk,16723
75
+ konduktor/templates/pod.yaml.j2,sha256=GoBNypXgPkFfmD-a4t0WVCo9sXbgSedvl-YNygso-Fc,17160
75
76
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
77
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
77
78
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -91,8 +92,8 @@ konduktor/utils/schemas.py,sha256=VGPERAso2G4sVAznsJ80qT2Q-I_EFxXw6Rfcw-vkYgQ,16
91
92
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
92
93
  konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
93
94
  konduktor/utils/validator.py,sha256=uCRlScO1NYxsbTNKY9dkoqvlO8S0ISIIB8XmX2ItcO8,2793
94
- konduktor_nightly-0.1.0.dev20250716105229.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
- konduktor_nightly-0.1.0.dev20250716105229.dist-info/METADATA,sha256=b7CQSOl3ZWKLIni_LBzurM0ff7pjhOXpMH8KVRxNOzE,4247
96
- konduktor_nightly-0.1.0.dev20250716105229.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
- konduktor_nightly-0.1.0.dev20250716105229.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
- konduktor_nightly-0.1.0.dev20250716105229.dist-info/RECORD,,
95
+ konduktor_nightly-0.1.0.dev20250718105306.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
96
+ konduktor_nightly-0.1.0.dev20250718105306.dist-info/METADATA,sha256=YAZVZrsrhsj59NUbTfOQ7smp0gxbTv9an9SngBozyZA,4247
97
+ konduktor_nightly-0.1.0.dev20250718105306.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
98
+ konduktor_nightly-0.1.0.dev20250718105306.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
99
+ konduktor_nightly-0.1.0.dev20250718105306.dist-info/RECORD,,