konduktor-nightly 0.1.0.dev20250715105209__py3-none-any.whl → 0.1.0.dev20250717105242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '5760f2c18ed487270e9244d22c7209eee12821c1'
17
+ _KONDUKTOR_COMMIT_SHA = '84818710a16e0a0515fbbd7878395fca37cf94f7'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250715105209'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250717105242'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -0,0 +1 @@
1
+ KONDUKTOR_SSH_PORT = 2222
@@ -18,6 +18,7 @@ if typing.TYPE_CHECKING:
18
18
 
19
19
  import konduktor
20
20
  from konduktor import authentication, config, constants, kube_client, logging
21
+ from konduktor.backends import constants as backend_constants
21
22
  from konduktor.data import registry
22
23
  from konduktor.utils import (
23
24
  common_utils,
@@ -79,6 +80,8 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
79
80
  Returns:
80
81
  Dict[str, Any]: k8s pod spec
81
82
  """
83
+ context = kubernetes_utils.get_current_kube_config_context_name()
84
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
82
85
 
83
86
  # fill out the templating variables
84
87
  assert task.resources is not None, 'Task resources are required'
@@ -107,7 +110,6 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
107
110
  sync_commands = []
108
111
  mkdir_commands = []
109
112
  storage_secrets = {}
110
-
111
113
  # first do storage_mount sync
112
114
  for dst, store in task.storage_mounts.items():
113
115
  # TODO(asaiacai) idk why but there's an extra storage mount for the
@@ -122,6 +124,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
122
124
  # should implement a method here instead of raw dog dict access
123
125
  cloud_store = registry._REGISTRY[store_scheme]
124
126
  storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
127
+ exists, _ = kubernetes_utils.check_secret_exists(
128
+ storage_secrets[store_scheme], namespace=namespace, context=context
129
+ )
130
+ assert exists, (
131
+ f"secret {storage_secrets[store_scheme]} doesn't "
132
+ f'exist in namespace {namespace}'
133
+ )
125
134
  mkdir_commands.append(
126
135
  f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
127
136
  )
@@ -142,10 +151,15 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
142
151
  f'mkdir -p {os.path.dirname(dst)}'
143
152
  )
144
153
  storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
154
+ exists, reason = kubernetes_utils.check_secret_exists(
155
+ storage_secrets[store_scheme], namespace=namespace, context=context
156
+ )
157
+ assert exists, (
158
+ f'secret {storage_secrets[store_scheme]} '
159
+ f"doesn't exist in namespace {namespace}"
160
+ )
145
161
  sync_commands.append(cloud_store.make_sync_file_command(src, dst))
146
162
 
147
- context = kubernetes_utils.get_current_kube_config_context_name()
148
- namespace = kubernetes_utils.get_kube_config_context_namespace(context)
149
163
  tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
150
164
  if tailscale_secret:
151
165
  secret_exist, err = kubernetes_utils.check_secret_exists(
@@ -158,7 +172,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
158
172
  f'though specified by `tailscale.secret_name`: {err}'
159
173
  )
160
174
 
161
- enable_ssh = config.get_nested(('ssh', 'enable'), False)
175
+ enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
162
176
  secret_name = None
163
177
  if enable_ssh:
164
178
  private_key_path, public_key_path = authentication.get_or_generate_keys()
@@ -188,8 +202,6 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
188
202
  env_secret_envs = []
189
203
  default_secrets = []
190
204
 
191
- context = kubernetes_utils.get_current_kube_config_context_name()
192
- namespace = kubernetes_utils.get_kube_config_context_namespace(context)
193
205
  user_hash = common_utils.get_user_hash()
194
206
  label_selector = f'konduktor/owner={user_hash}'
195
207
  user_secrets = kubernetes_utils.list_secrets(
@@ -246,11 +258,14 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
246
258
  # SSH
247
259
  'enable_ssh': enable_ssh,
248
260
  'secret_name': secret_name,
261
+ 'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
249
262
  # Kinds of Secrets
250
263
  # --kind git-ssh
251
264
  'git_ssh': git_ssh_secret_name,
252
265
  # --kind default
253
266
  'default_secrets': default_secrets,
267
+ # KONDUKTOR_DEBUG
268
+ 'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
254
269
  },
255
270
  temp.name,
256
271
  )
@@ -277,6 +292,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
277
292
  pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
278
293
  env_map.values()
279
294
  )
295
+ logger.debug(f'rendered pod spec: \n\t{pod_config}')
280
296
 
281
297
  # validate pod spec using json schema
282
298
  try:
konduktor/cli.py CHANGED
@@ -713,6 +713,13 @@ def launch(
713
713
  required=False,
714
714
  )
715
715
  @click.option('--all', '-a', default=None, is_flag=True, help='Tear down all jobs.')
716
+ @click.option(
717
+ '--all-users',
718
+ '--all_users',
719
+ default=False,
720
+ is_flag=True,
721
+ help='Include other users for teardown',
722
+ )
716
723
  @click.option(
717
724
  '--yes',
718
725
  '-y',
@@ -723,7 +730,8 @@ def launch(
723
730
  )
724
731
  def down(
725
732
  jobs: List[str],
726
- all: Optional[bool], # pylint: disable=redefined-builtin
733
+ all: Optional[bool],
734
+ all_users: Optional[bool],
727
735
  yes: bool,
728
736
  ):
729
737
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -753,26 +761,39 @@ def down(
753
761
  # Tear down all jobs matching a pattern.
754
762
  konduktor down "test-*"
755
763
  \b
756
- # Tear down all existing jobs.
764
+ # Tear down all of this user's jobs.
757
765
  konduktor down -a
766
+ konduktor down --all
767
+
768
+ # Tear down all jobs across all users
769
+ konduktor down --all --all-users
758
770
 
759
771
  """
760
772
 
761
773
  context = kubernetes_utils.get_current_kube_config_context_name()
762
774
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
775
+ jobs_response = jobset_utils.list_jobset(namespace)
776
+ assert jobs_response
777
+ jobs_specs = [
778
+ job
779
+ for job in jobs_response['items']
780
+ if (
781
+ job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
782
+ == common_utils.user_and_hostname_hash()
783
+ and not all_users
784
+ )
785
+ ]
763
786
 
764
787
  if all:
765
- jobs_specs = jobset_utils.list_jobset(namespace)
766
788
  assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
767
789
  assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
768
- jobs = [job['metadata']['name'] for job in jobs_specs['items']]
790
+ jobs = [job['metadata']['name'] for job in jobs_specs]
769
791
  elif jobs:
770
792
  # Get all available jobs to match against patterns
771
- jobs_specs = jobset_utils.list_jobset(namespace)
772
- if jobs_specs is None or len(jobs_specs.get('items', [])) == 0:
793
+ if len(jobs_specs) == 0:
773
794
  raise click.ClickException(f'No jobs found in namespace {namespace}')
774
795
 
775
- all_job_names = [job['metadata']['name'] for job in jobs_specs['items']]
796
+ all_job_names = [job['metadata']['name'] for job in jobs_specs]
776
797
  matched_jobs = []
777
798
 
778
799
  for job_pattern in jobs:
konduktor/data/gcp/gcs.py CHANGED
@@ -162,8 +162,6 @@ class GcsStore(storage_utils.AbstractStore):
162
162
  # This will generate
163
163
  # ~/.config/gcloud/application_default_credentials.json.
164
164
  f'{_INDENT_PREFIX} $ gcloud auth application-default login\n'
165
- f'{_INDENT_PREFIX}For more info: '
166
- 'https://konduktor.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # noqa: E501
167
165
  )
168
166
  _APPLICATION_CREDENTIAL_HINT = (
169
167
  'Run the following commands:\n'
@@ -171,8 +169,6 @@ class GcsStore(storage_utils.AbstractStore):
171
169
  f'{_INDENT_PREFIX}Or set the environment variable '
172
170
  'GOOGLE_APPLICATION_CREDENTIALS '
173
171
  'to the path of your service account key file.\n'
174
- f'{_INDENT_PREFIX}For more info: '
175
- 'https://konduktor.readthedocs.ioo/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # noqa: E501
176
172
  )
177
173
 
178
174
  _REPR = 'GcsStore'
@@ -20,7 +20,8 @@ kubernetes:
20
20
  - name: konduktor-container
21
21
  {% if enable_ssh %}
22
22
  ports:
23
- - containerPort: 2222
23
+ - name: ssh
24
+ containerPort: {{ konduktor_ssh_port }}
24
25
  {% endif %}
25
26
  image: {{ image_id }}
26
27
  # this is set during jobset definition since we need to know the jobset
@@ -38,6 +39,10 @@ kubernetes:
38
39
  valueFrom:
39
40
  fieldRef:
40
41
  fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
42
+ - name: LOCAL_ADDR
43
+ valueFrom:
44
+ fieldRef:
45
+ fieldPath: status.podIP
41
46
  - name: NUM_NODES
42
47
  value: "{{ num_nodes }}"
43
48
  - name: NUM_GPUS_PER_NODE
@@ -71,6 +76,8 @@ kubernetes:
71
76
  secretKeyRef:
72
77
  name: {{ secret_name }}
73
78
  key: PRIVKEY
79
+ - name: KONDUKTOR_SSH_PORT
80
+ value: "{{ konduktor_ssh_port }}"
74
81
  {% endif %}
75
82
  {% if git_ssh %}
76
83
  - name: GIT_SSH_COMMAND
@@ -113,7 +120,11 @@ kubernetes:
113
120
  - |
114
121
  # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
115
122
  # Helper function to conditionally use sudo
123
+ export RDZV_CONF=is_host=$(if [ "$RANK" == "0" ]; then echo "true"; else echo "false"; fi)
116
124
  set -eo pipefail
125
+ {% if konduktor_debug %}
126
+ set -x
127
+ {% endif %}
117
128
  mkdir -p ~/.konduktor/tmp
118
129
  start_epoch=$(date +%s);
119
130
  start_setup=$(date +%s);
@@ -167,7 +178,7 @@ kubernetes:
167
178
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
168
179
  fi;
169
180
  end_epoch=$(date +%s);
170
-
181
+
171
182
  echo "Exposing ENV variables"
172
183
  $(prefix_cmd) env -0 | awk -v RS='\0' '
173
184
  {
@@ -185,6 +196,7 @@ kubernetes:
185
196
  function InstallSSH {
186
197
  export DEBIAN_FRONTEND=noninteractive
187
198
  export TZ=Etc/UTC
199
+ set -u
188
200
  if service sshd status > /dev/null 2>&1; then
189
201
  $(prefix_cmd) echo "OpenSSH server is already started."
190
202
  return
@@ -242,14 +254,15 @@ kubernetes:
242
254
  # turn off PAM to fix sshd login issue
243
255
  $(prefix_cmd) sed -i 's/UsePAM yes/UsePAM no/' /etc/ssh/sshd_config
244
256
 
245
- # set default port to 2222
246
- $(prefix_cmd) sed -i 's/#Port 22/Port 2222/' /etc/ssh/sshd_config
257
+ # set the sshd port to the templated konduktor_ssh_port (KONDUKTOR_SSH_PORT, currently 2222)
258
+ $(prefix_cmd) sed -i 's/#Port 22/Port {{ konduktor_ssh_port }}/' /etc/ssh/sshd_config
247
259
 
248
260
  $(prefix_cmd) mkdir /run/sshd
249
261
  $(prefix_cmd) chmod 0755 /run/sshd
250
262
 
251
263
  $(prefix_cmd) service ssh start
252
264
  $(prefix_cmd) echo "sshd service started"
265
+ set +u
253
266
  }
254
267
 
255
268
  InstallSSH
@@ -263,7 +276,7 @@ kubernetes:
263
276
  if ! command -v tailscale >/dev/null 2>&1; then
264
277
  $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
265
278
  fi
266
- $(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &
279
+ $(prefix_cmd) tailscaled --tun=userspace-networking --state=mem: >/dev/null 2>&1 &
267
280
  $(prefix_cmd) timeout 5 tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME}
268
281
  $(prefix_cmd) sleep 10
269
282
  done
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250715105209
3
+ Version: 0.1.0.dev20250717105242
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=v-n0dnLTzwfu7Y3lEvTmtk4Cn7-hyB8aYcyeb_AcCgI,1540
1
+ konduktor/__init__.py,sha256=No4XStM7tVXkkg8Ut05Ttw2ZkuKgSK71G639a6FYooM,1540
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -6,10 +6,11 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
6
6
  konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
7
7
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
8
8
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
9
+ konduktor/backends/constants.py,sha256=nU_cd4x8V2GwP9-oGlcIwjt5snnyhmOlxXbXRZ8d6Fc,26
9
10
  konduktor/backends/jobset.py,sha256=UdhwAuZODLMbLY51Y2zOBsh6wg4Pb84oHVvUKzx3Z2w,8434
10
- konduktor/backends/jobset_utils.py,sha256=u5Z3SYv7rmkS1M0l1sHR5QEnJ-mIMk5hrM9WoPjJCoE,22283
11
+ konduktor/backends/jobset_utils.py,sha256=zOxXikz5fRsciTIdEgK3mQNxJfsRVEiW1K6FQI5ZD3Y,23042
11
12
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
12
- konduktor/cli.py,sha256=qtktD8N17IRC5MYEdaE0o3pv8EI36cvyyQkYUFi5_nQ,35590
13
+ konduktor/cli.py,sha256=GXmm4DGLHuvfnqDG_3PsWXK7mOI3XWvlC3VeI6JDoqI,36004
13
14
  konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
14
15
  konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
15
16
  konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -56,7 +57,7 @@ konduktor/data/constants.py,sha256=yXVEoTI2we1xOjVSU-bjRCQCLpVvpEvJ0GedXvSwEfw,1
56
57
  konduktor/data/data_utils.py,sha256=IG1jgb_La997wi90xCvxYYsHQRlmm8Aooq04ZSf8EDI,9670
57
58
  konduktor/data/gcp/__init__.py,sha256=rlQxACBC_Vu36mdgPyJgUy4mGc_6Nt_a96JAuaPz2pQ,489
58
59
  konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
59
- konduktor/data/gcp/gcs.py,sha256=Zc1LXrjoeNU9EDK229evrKxjVqsKIUicbtYlugA_TiY,42229
60
+ konduktor/data/gcp/gcs.py,sha256=fFaQydgFj0zJbZrVCrOq7goXE6gT19i3f1NQpe_Hdq4,41888
60
61
  konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
61
62
  konduktor/data/registry.py,sha256=CUbMsN_Q17Pf4wRHkqZrycErEjTP7cLEdgcfwVGcEpc,696
62
63
  konduktor/data/storage.py,sha256=o2So-bY9glvgbGdoN7AQNYmNnvGf1AUDPpImtadRL90,35213
@@ -71,7 +72,7 @@ konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw
71
72
  konduktor/resource.py,sha256=nHgPWXCbWj5sWyslNngrFypMN1K0Dksb0yHbJqWaei8,19612
72
73
  konduktor/task.py,sha256=ofwd8WIhfD6C3ThLcv6X3GUzQHyZ6ddjUagE-umF4K0,35207
73
74
  konduktor/templates/jobset.yaml.j2,sha256=rdURknodtgLp4zoA2PX86Nn4wPpi3tr5l4IG55aWBRg,1059
74
- konduktor/templates/pod.yaml.j2,sha256=7512UVJHv_8mQu0WO9yuOR3HGjWYtHOLe2IsatKeDH0,16635
75
+ konduktor/templates/pod.yaml.j2,sha256=GoBNypXgPkFfmD-a4t0WVCo9sXbgSedvl-YNygso-Fc,17160
75
76
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
77
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
77
78
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -91,8 +92,8 @@ konduktor/utils/schemas.py,sha256=VGPERAso2G4sVAznsJ80qT2Q-I_EFxXw6Rfcw-vkYgQ,16
91
92
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
92
93
  konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
93
94
  konduktor/utils/validator.py,sha256=uCRlScO1NYxsbTNKY9dkoqvlO8S0ISIIB8XmX2ItcO8,2793
94
- konduktor_nightly-0.1.0.dev20250715105209.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
- konduktor_nightly-0.1.0.dev20250715105209.dist-info/METADATA,sha256=wJo0dIzmQqEuSfL-9zkBgRtl9f1xVEtcHmtf2o9cdj0,4247
96
- konduktor_nightly-0.1.0.dev20250715105209.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
- konduktor_nightly-0.1.0.dev20250715105209.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
- konduktor_nightly-0.1.0.dev20250715105209.dist-info/RECORD,,
95
+ konduktor_nightly-0.1.0.dev20250717105242.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
96
+ konduktor_nightly-0.1.0.dev20250717105242.dist-info/METADATA,sha256=Ia8KlWcaZd7H2K18mcrXni62pY2E9mUj9mvxAFG5yf8,4247
97
+ konduktor_nightly-0.1.0.dev20250717105242.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
98
+ konduktor_nightly-0.1.0.dev20250717105242.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
99
+ konduktor_nightly-0.1.0.dev20250717105242.dist-info/RECORD,,