konduktor-nightly 0.1.0.dev20250815104918__py3-none-any.whl → 0.1.0.dev20250817104709__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

konduktor/__init__.py CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '9e2a8dc9b3415f811b7c8f1e7ed4c8d9e941ec61'
14
+ _KONDUKTOR_COMMIT_SHA = '108d7fe47b1bd5db50d555510714d2e204fb7b6f'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250815104918'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250817104709'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
176
176
  context = kubernetes_utils.get_current_kube_config_context_name()
177
177
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
178
178
  # TODO(asaiacai): need to set env variables in pod
179
- jobset_response = jobset_utils.create_jobset(
179
+ jobset_utils.create_jobset(
180
180
  namespace,
181
181
  task,
182
182
  pod_spec['kubernetes']['pod_config'],
@@ -192,10 +192,9 @@ class JobsetBackend(backend.Backend):
192
192
  ):
193
193
  _wait_for_jobset_start(namespace, task.name)
194
194
  try:
195
- assert jobset_response is not None
196
195
  log_thread = threading.Thread(
197
196
  target=log_utils.tail_logs,
198
- args=(jobset_response,),
197
+ args=(task.name,),
199
198
  daemon=True,
200
199
  )
201
200
  logger.info('streaming logs...')
@@ -7,6 +7,16 @@ kubernetes:
7
7
  {% if accelerator_type %}
8
8
  trainy.ai/accelerator: {{ accelerator_type }}
9
9
  {% endif %}
10
+ {% if konduktor_debug %}
11
+ konduktor.ai/debug: "true"
12
+ {% else %}
13
+ konduktor.ai/debug: "false"
14
+ {% endif %}
15
+ {% if tailscale_secret %}
16
+ konduktor.ai/tailscale: "true"
17
+ {% else %}
18
+ konduktor.ai/tailscale: "false"
19
+ {% endif %}
10
20
  spec:
11
21
  restartPolicy: "Never"
12
22
  # trigger this on GPU request
@@ -312,13 +322,18 @@ kubernetes:
312
322
  if ! command -v tailscale >/dev/null 2>&1; then
313
323
  $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
314
324
  fi
315
- $(prefix_cmd) tailscaled --tun=userspace-networking --state=mem: >/dev/null 2>&1 &
316
- $(prefix_cmd) timeout 5 tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME}
317
- $(prefix_cmd) sleep 10
325
+ $(prefix_cmd) tailscaled --tun=userspace-networking --state=mem: >~/.konduktor/tmp/tailscaled.log 2>&1 &
326
+ $(prefix_cmd) sleep 2
327
+ $(prefix_cmd) timeout 5 tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} || echo "tailscale up failed retrying"
328
+ $(prefix_cmd) sleep 2
318
329
  done
319
330
  $(prefix_cmd) echo "Tailscale is up"
331
+ $(prefix_cmd) tailscale status
320
332
  }
321
333
  InstallTailscale | tee ~/.konduktor/tmp/tailscale-out.log
334
+ {% if konduktor_debug %}
335
+ $(prefix_cmd) cat ~/.konduktor/tmp/tailscale*.log
336
+ {% endif %}
322
337
  {% endif %}
323
338
  end_epoch=$(date +%s);
324
339
 
@@ -416,8 +431,8 @@ kubernetes:
416
431
  secretName: {{ git_ssh }}
417
432
  defaultMode: 384
418
433
  {% endif %}
419
-
420
434
 
435
+
421
436
  # TODO(asaiacai): should we add nodeSelectors here or leave to
422
437
  # kueue resource flavors. leaning towards defining
423
438
  # in kueue and just querying for the kueue resource flavor
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250815104918
3
+ Version: 0.1.0.dev20250817104709
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=IJrSlRG-q24AZUQh3Sx5BuKRBdiiWfGk1J6UV0Od5KI,1574
1
+ konduktor/__init__.py,sha256=zsVeeXPLti6TupL5auk0Nzz-d4K3Ukz0cKs7Oex2LyQ,1574
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -9,7 +9,7 @@ konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0
9
9
  konduktor/backends/constants.py,sha256=nt9G9AmFCOMwO4GuKgRQSzJJuKapOmaROp4_Y0tMF5A,732
10
10
  konduktor/backends/deployment.py,sha256=EHfB2uLeKFQ3maek9tx6XL4_sjQ-ax59DZA79Q3EkVs,5519
11
11
  konduktor/backends/deployment_utils.py,sha256=VGuL01rKe7p7PoVRI_cP4tiZRxHZ13nnTMG-bmDf7P0,28975
12
- konduktor/backends/jobset.py,sha256=E9THHmcpxTohsx6Goi9mKF4dy_mYpR2DHloSwGVr9jA,8509
12
+ konduktor/backends/jobset.py,sha256=OwgDog9nH-FoUmNU_H--C3U5jx70reTKL1l849M1k5A,8430
13
13
  konduktor/backends/jobset_utils.py,sha256=7fB8X4b2Q5BKFCIGME72dyeCfi-EemoMeJVnwtzcjq4,25184
14
14
  konduktor/backends/pod_utils.py,sha256=Jfv_CY8suF0e7QEaeQiNRRxRnOueLgPR8SfLEO7lnwc,15260
15
15
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
@@ -77,7 +77,7 @@ konduktor/serving.py,sha256=sh8TPAUXg23Bkt0ByatIMdxFFqzRm18HJTEkt3wHzdo,5147
77
77
  konduktor/task.py,sha256=97iLCo62qpN9wLGNPeFw64E8k1nch7AyySY3BUXHPWY,37496
78
78
  konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
79
79
  konduktor/templates/jobset.yaml.j2,sha256=gURWl6uQv_OLni-LFy2E7ttjGOtuRDt5Vfs4ALH7fpI,1196
80
- konduktor/templates/pod.yaml.j2,sha256=TORLvkLthUc4KXTGccIJbNaE-y3uXwlZFHP6zebxhdI,18238
80
+ konduktor/templates/pod.yaml.j2,sha256=gxBaHFwDfRE71nh3glYPsmirOr6Qn__-f6oHmQRP4QU,18809
81
81
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
82
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
83
83
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -97,8 +97,8 @@ konduktor/utils/schemas.py,sha256=tBrKhnkfn9uKDYdlb4L2KgooW-muuhww7U8fu9zX-ms,18
97
97
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
98
98
  konduktor/utils/ux_utils.py,sha256=7-Lt3QbDVvBQUli5_U9lOdXKeC-ip8rZBpO9gQ6vPJw,7955
99
99
  konduktor/utils/validator.py,sha256=5C1kE57Eyj1OPnAbvojqMNHHtf5fnl47FK_vEttd8aw,4331
100
- konduktor_nightly-0.1.0.dev20250815104918.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
- konduktor_nightly-0.1.0.dev20250815104918.dist-info/METADATA,sha256=nK82xb6Qw94vR9UyKVmUma1MBA0geg6QUNky96zpsJU,4247
102
- konduktor_nightly-0.1.0.dev20250815104918.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
- konduktor_nightly-0.1.0.dev20250815104918.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
- konduktor_nightly-0.1.0.dev20250815104918.dist-info/RECORD,,
100
+ konduktor_nightly-0.1.0.dev20250817104709.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
+ konduktor_nightly-0.1.0.dev20250817104709.dist-info/METADATA,sha256=HBqNt0M8Qy0Jk_SUAquvJyQXnRZaQybu2yeeqYWkMDo,4247
102
+ konduktor_nightly-0.1.0.dev20250817104709.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
+ konduktor_nightly-0.1.0.dev20250817104709.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
+ konduktor_nightly-0.1.0.dev20250817104709.dist-info/RECORD,,