konduktor-nightly 0.1.0.dev20250430104745__py3-none-any.whl → 0.1.0.dev20250502104736__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = 'd1d19dd0b1d1e1440aad10115f235e2b6ea95dd7'
17
+ _KONDUKTOR_COMMIT_SHA = '8035a53e2bd3595665be51149e8287dc721d5edb'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250430104745'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250502104736'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -15,9 +15,15 @@ if typing.TYPE_CHECKING:
15
15
  from datetime import timedelta
16
16
 
17
17
  import konduktor
18
- from konduktor import constants, kube_client, logging
18
+ from konduktor import config, constants, kube_client, logging
19
19
  from konduktor.data import registry
20
- from konduktor.utils import common_utils, kubernetes_utils, log_utils
20
+ from konduktor.utils import (
21
+ common_utils,
22
+ exceptions,
23
+ kubernetes_utils,
24
+ log_utils,
25
+ ux_utils,
26
+ )
21
27
 
22
28
  if typing.TYPE_CHECKING:
23
29
  pass
@@ -42,7 +48,7 @@ _JOBSET_METADATA_LABELS = {
42
48
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
43
49
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
44
50
  }
45
- _RUN_DURATION_ANNOTATION = 'maxRunDurationSeconds'
51
+
46
52
  _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
47
53
 
48
54
 
@@ -130,6 +136,20 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
130
136
  storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
131
137
  sync_commands.append(cloud_store.make_sync_file_command(src, dst))
132
138
 
139
+ tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
140
+ if tailscale_secret:
141
+ context = kubernetes_utils.get_current_kube_config_context_name()
142
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
143
+ secret_exist, err = kubernetes_utils.check_secret_exists(
144
+ tailscale_secret, namespace, context
145
+ )
146
+ if not secret_exist:
147
+ with ux_utils.print_exception_no_traceback():
148
+ raise exceptions.MissingSecretError(
149
+ f'No tailscale auth-key secret `{tailscale_secret}` found even '
150
+ f'though specified by `tailscale.secret_name`: {err}'
151
+ )
152
+
133
153
  assert task.resources is not None, 'Task resources are required'
134
154
  assert task.resources.cpus is not None, 'Task resources cpus are required'
135
155
  assert task.resources.memory is not None, 'Task resources memory are required'
@@ -154,6 +174,8 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
154
174
  'mount_secrets': storage_secrets,
155
175
  'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
156
176
  'user': common_utils.get_cleaned_username(),
177
+ # Tailscale credentials
178
+ 'tailscale_secret': tailscale_secret,
157
179
  },
158
180
  temp.name,
159
181
  )
@@ -16,8 +16,7 @@ kubernetes:
16
16
  operator: "Exists"
17
17
  {% endif %}
18
18
  containers:
19
- # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
20
- # TODO(asaiacai): add ulimits
19
+ # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
21
20
  - name: konduktor-container
22
21
  image: {{ image_id }}
23
22
  # this is set during jobset definition since we need to know the jobset
@@ -39,6 +38,24 @@ kubernetes:
39
38
  value: "{{ num_nodes }}"
40
39
  - name: NUM_GPUS_PER_NODE
41
40
  value: "{{ num_gpus }}"
41
+ {% if tailscale_secret %}
42
+ - name: TS_USERSPACE
43
+ value: "true"
44
+ - name: TS_AUTHKEY
45
+ valueFrom:
46
+ secretKeyRef:
47
+ name: {{ tailscale_secret }}
48
+ key: TS_AUTHKEY
49
+ optional: true
50
+ - name: POD_NAME
51
+ valueFrom:
52
+ fieldRef:
53
+ fieldPath: metadata.name
54
+ - name: POD_UID
55
+ valueFrom:
56
+ fieldRef:
57
+ fieldPath: metadata.uid
58
+ {% endif %}
42
59
  # these are for compatibility with skypilot
43
60
  - name: SKYPILOT_NODE_IPS
44
61
  value: "{{ node_hostnames }}"
@@ -58,7 +75,6 @@ kubernetes:
58
75
  mountPath: /run/konduktor/{{ secret_type }}-secret
59
76
  {% endfor %}
60
77
  command: ["bash", "-c"]
61
- # TODO(asaiacai): should we just mount this as a configmap instead? - Edit: probably not
62
78
  args:
63
79
  - |
64
80
  # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
@@ -76,7 +92,7 @@ kubernetes:
76
92
  {% if 'rsync' in run_cmd %}
77
93
  PACKAGES="$PACKAGES rsync";
78
94
  {% endif %}
79
- {% if 'curl' in run_cmd %}
95
+ {% if 'curl' in run_cmd or tailscale_secret %}
80
96
  PACKAGES="$PACKAGES curl";
81
97
  {% endif %}
82
98
  {% if 'gs' in mount_secrets or 's3' in mount_secrets %}
@@ -117,6 +133,17 @@ kubernetes:
117
133
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES >> ~/.konduktor/tmp/apt-install.log;
118
134
  fi;
119
135
  end_epoch=$(date +%s);
136
+
137
+ {% if tailscale_secret %}
138
+ if ! command -v tailscale >/dev/null 2>&1; then
139
+ export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
140
+ $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh >> ~/.konduktor/tmp/tailscale-install.log
141
+ $(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &
142
+ $(prefix_cmd) tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} >/dev/null 2>&1
143
+ fi
144
+ {% endif %}
145
+ end_epoch=$(date +%s);
146
+
120
147
  $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
121
148
 
122
149
  # unpack secrets credentials
@@ -153,6 +180,7 @@ kubernetes:
153
180
  {% endif %}
154
181
  end_epoch=$(date +%s);
155
182
  end_setup_time=$((end_epoch - start_setup));
183
+ ulimit -Sc 0 && ulimit -Hc 0
156
184
  $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
157
185
  # run task
158
186
  $(prefix_cmd) cd {{ remote_workdir }}
@@ -180,7 +208,8 @@ kubernetes:
180
208
  {% endif %}
181
209
  securityContext:
182
210
  capabilities:
183
- add: ["IPC_LOCK"] # May be needed for memlock
211
+ add:
212
+ - "IPC_LOCK" # May be needed for memlock
184
213
 
185
214
  volumes:
186
215
  - name: shared-memory
@@ -130,6 +130,10 @@ class CommandError(Exception):
130
130
  pass
131
131
 
132
132
 
133
+ class MissingSecretError(Exception):
134
+ pass
135
+
136
+
133
137
  class NotSupportedError(Exception):
134
138
  """Raised when a feature is not supported."""
135
139
 
@@ -563,6 +563,17 @@ def get_config_schema():
563
563
  },
564
564
  }
565
565
 
566
+ tailscale_configs = {
567
+ 'type': 'object',
568
+ 'required': [],
569
+ 'additionalProperties': False,
570
+ 'properties': {
571
+ 'secret_name': {
572
+ 'type': 'string',
573
+ },
574
+ },
575
+ }
576
+
566
577
  for cloud, config in cloud_configs.items():
567
578
  if cloud == 'kubernetes':
568
579
  config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES)
@@ -577,6 +588,7 @@ def get_config_schema():
577
588
  'admin_policy': admin_policy_schema,
578
589
  'nvidia_gpus': gpu_configs,
579
590
  'allowed_clouds': allowed_clouds,
591
+ 'tailscale': tailscale_configs,
580
592
  **cloud_configs,
581
593
  },
582
594
  }
@@ -11,6 +11,7 @@ from typing import Callable, Optional, Union
11
11
  import colorama
12
12
  import rich.console as rich_console
13
13
 
14
+ from konduktor import config
14
15
  from konduktor import logging as konduktor_logging
15
16
 
16
17
  if typing.TYPE_CHECKING:
@@ -211,6 +212,11 @@ def command_hint_messages(hint_type: CommandHintType, job_id: str) -> str:
211
212
  )
212
213
  hint_str = '\n📋 Useful Commands'
213
214
  hint_str += f'{job_hint_str}'
215
+ if config.get_nested(('tailscale', 'secret_name'), None) is not None:
216
+ hint_str += (
217
+ f'\n{INDENT_SYMBOL}To tailscale ssh:\t\t'
218
+ f'{BOLD}ssh root@{job_id}-workers-0-0 {RESET_BOLD}'
219
+ )
214
220
  return hint_str
215
221
  else:
216
222
  raise ValueError(f'Invalid hint type: {hint_type}')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250430104745
3
+ Version: 0.1.0.dev20250502104736
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=Fur_FhjkiRaTC8mYuUo1rf0_RXIiVRM9U9oUYdDL5m8,1540
1
+ konduktor/__init__.py,sha256=jMTLcTV0GSBv8eb57oCZaR9dTjrcnVas0dkuTq_7HAg,1540
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=uTdpKvgBSwYMmynx9wR5kiZQyTrdaw9ZI4KH6Z2E5Hw,4296
@@ -6,7 +6,7 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
6
6
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
7
7
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
8
8
  konduktor/backends/jobset.py,sha256=veptYGXtk-ugWxBsBV5SnqI4rGKOlGfm_N3wApvNhSQ,8326
9
- konduktor/backends/jobset_utils.py,sha256=UJkDu6Y8u4N2AaNSJTOSgbGLyY25bzaP-I6esJ11jms,17578
9
+ konduktor/backends/jobset_utils.py,sha256=B0N0sx-pWF9_CDeuSXAU4nm3ZIwroyVcq6aUAlNZZRs,18376
10
10
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
11
11
  konduktor/cli.py,sha256=Ii9-2mrc-1f2ksLasA-xRb-JnEi_9ZeCXZ3lJ1GG8H8,23515
12
12
  konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
@@ -70,7 +70,7 @@ konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw
70
70
  konduktor/resource.py,sha256=68z8gC8Ivqktwv0R6ylMn9ZNocgkcRT0yIRGGKOdwcM,18491
71
71
  konduktor/task.py,sha256=Vu1TzYtLvSBz-HyHY2gsM2cMcUhMNQu44L3CWmYRXKE,35232
72
72
  konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
73
- konduktor/templates/pod.yaml.j2,sha256=q_kFyuYP-RlHocQUV8Vsnh-hYurWO7Y_pwl0gytGMl4,8728
73
+ konduktor/templates/pod.yaml.j2,sha256=s3eECjLevUWR-zvyeI8WjQWxQYJh_AMk1tdQVGNXpEM,9835
74
74
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
75
75
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
76
76
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -80,18 +80,18 @@ konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMug
80
80
  konduktor/utils/common_utils.py,sha256=F5x7k4AdBB44u8PYRkaugORnZKnK3JLqGn1jHOKgUYo,14960
81
81
  konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
82
82
  konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
83
- konduktor/utils/exceptions.py,sha256=GBOFIkk9nikqWGR0FXGXOWVVImoH7nWnMl_L3Oux3fo,6581
83
+ konduktor/utils/exceptions.py,sha256=IHyaP5ERZpPvWZeKWV3MVTyKsxo2Fq-13nhI0PRNQzk,6629
84
84
  konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
85
85
  konduktor/utils/kubernetes_utils.py,sha256=ivFVh90Gez19_JD5U4bgCO5zNtQUflF0hJsM5nZLj8A,23864
86
86
  konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
87
87
  konduktor/utils/loki_utils.py,sha256=ND1pbbbFhLhLKw3870j44LpR_9MB0EkDJSs5K7nWdY4,3473
88
88
  konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
89
- konduktor/utils/schemas.py,sha256=Gv7SEhFpv-eO5izqRz8d-eQ9z-lVmY05akm6HEXIIdc,17478
89
+ konduktor/utils/schemas.py,sha256=_VCWnsSgyP3u5cpACEmJeuqcy5mzu_fr0McHyZdiXd8,17757
90
90
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
91
- konduktor/utils/ux_utils.py,sha256=NPNu3Igu2Z9Oq77ghJhy_fIxQZTXWr9BtKyxN3Wslzo,7164
91
+ konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
92
92
  konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
93
- konduktor_nightly-0.1.0.dev20250430104745.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
94
- konduktor_nightly-0.1.0.dev20250430104745.dist-info/METADATA,sha256=A9ws8bq2H4K0J6wyXSPrZWjkbMGgE0cGHDJA4R3W7To,4366
95
- konduktor_nightly-0.1.0.dev20250430104745.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
96
- konduktor_nightly-0.1.0.dev20250430104745.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
97
- konduktor_nightly-0.1.0.dev20250430104745.dist-info/RECORD,,
93
+ konduktor_nightly-0.1.0.dev20250502104736.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
94
+ konduktor_nightly-0.1.0.dev20250502104736.dist-info/METADATA,sha256=smj8TlY9k4hFEtZNDLlnl5W-b8UjtFepi-iTqGiuI9o,4366
95
+ konduktor_nightly-0.1.0.dev20250502104736.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
96
+ konduktor_nightly-0.1.0.dev20250502104736.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
97
+ konduktor_nightly-0.1.0.dev20250502104736.dist-info/RECORD,,