konduktor-nightly 0.1.0.dev20250430104745__tar.gz → 0.1.0.dev20250501104750__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (96)
  1. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/backends/jobset_utils.py +25 -3
  4. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/templates/pod.yaml.j2 +34 -5
  5. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/exceptions.py +4 -0
  6. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/schemas.py +12 -0
  7. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/ux_utils.py +6 -0
  8. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/pyproject.toml +1 -1
  9. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/LICENSE +0 -0
  10. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/README.md +0 -0
  11. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/adaptors/__init__.py +0 -0
  12. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/adaptors/aws.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/adaptors/common.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/adaptors/gcp.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/backends/__init__.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/backends/backend.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/backends/jobset.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/check.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/cli.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/config.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/constants.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/controller/__init__.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/controller/constants.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/controller/launch.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/controller/node.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/controller/parse.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/README.md +0 -0
  28. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/backend/main.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/backend/sockets.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  31. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/.gitignore +0 -0
  32. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  33. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  34. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  35. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  36. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  37. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  38. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  39. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  42. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  47. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/globals.css +0 -0
  48. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  49. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/layout.js +0 -0
  50. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  51. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/app/page.js +0 -0
  52. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  53. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  54. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/package-lock.json +0 -0
  55. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/package.json +0 -0
  56. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  57. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/server.js +0 -0
  58. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  59. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/__init__.py +0 -0
  60. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/aws/__init__.py +0 -0
  61. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/aws/s3.py +0 -0
  62. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/constants.py +0 -0
  63. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/data_utils.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/gcp/__init__.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/gcp/constants.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/gcp/gcs.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/gcp/utils.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/registry.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/storage.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/data/storage_utils.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/execution.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/kube_client.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/logging.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/manifests/controller_deployment.yaml +0 -0
  75. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  76. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  77. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  78. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/resource.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/task.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/templates/jobset.yaml.j2 +0 -0
  81. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/usage/__init__.py +0 -0
  82. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/usage/constants.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/__init__.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/accelerator_registry.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/annotations.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/base64_utils.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/common_utils.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/constants.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/env_options.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/kubernetes_enums.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/kubernetes_utils.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/log_utils.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/loki_utils.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/rich_utils.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/subprocess_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250430104745 → konduktor_nightly-0.1.0.dev20250501104750}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250430104745
3
+ Version: 0.1.0.dev20250501104750
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = 'd1d19dd0b1d1e1440aad10115f235e2b6ea95dd7'
17
+ _KONDUKTOR_COMMIT_SHA = 'abd70e24a6d56a34a471a73cbc8ac970f33717c0'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250430104745'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250501104750'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -15,9 +15,15 @@ if typing.TYPE_CHECKING:
15
15
  from datetime import timedelta
16
16
 
17
17
  import konduktor
18
- from konduktor import constants, kube_client, logging
18
+ from konduktor import config, constants, kube_client, logging
19
19
  from konduktor.data import registry
20
- from konduktor.utils import common_utils, kubernetes_utils, log_utils
20
+ from konduktor.utils import (
21
+ common_utils,
22
+ exceptions,
23
+ kubernetes_utils,
24
+ log_utils,
25
+ ux_utils,
26
+ )
21
27
 
22
28
  if typing.TYPE_CHECKING:
23
29
  pass
@@ -42,7 +48,7 @@ _JOBSET_METADATA_LABELS = {
42
48
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
43
49
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
44
50
  }
45
- _RUN_DURATION_ANNOTATION = 'maxRunDurationSeconds'
51
+
46
52
  _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
47
53
 
48
54
 
@@ -130,6 +136,20 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
130
136
  storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
131
137
  sync_commands.append(cloud_store.make_sync_file_command(src, dst))
132
138
 
139
+ tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
140
+ if tailscale_secret:
141
+ context = kubernetes_utils.get_current_kube_config_context_name()
142
+ namespace = kubernetes_utils.get_kube_config_context_namespace(context)
143
+ secret_exist, err = kubernetes_utils.check_secret_exists(
144
+ tailscale_secret, namespace, context
145
+ )
146
+ if not secret_exist:
147
+ with ux_utils.print_exception_no_traceback():
148
+ raise exceptions.MissingSecretError(
149
+ f'No tailscale auth-key secret `{tailscale_secret}` found even '
150
+ f'though specified by `tailscale.secret_name`: {err}'
151
+ )
152
+
133
153
  assert task.resources is not None, 'Task resources are required'
134
154
  assert task.resources.cpus is not None, 'Task resources cpus are required'
135
155
  assert task.resources.memory is not None, 'Task resources memory are required'
@@ -154,6 +174,8 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
154
174
  'mount_secrets': storage_secrets,
155
175
  'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
156
176
  'user': common_utils.get_cleaned_username(),
177
+ # Tailscale credentials
178
+ 'tailscale_secret': tailscale_secret,
157
179
  },
158
180
  temp.name,
159
181
  )
@@ -16,8 +16,7 @@ kubernetes:
16
16
  operator: "Exists"
17
17
  {% endif %}
18
18
  containers:
19
- # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
20
- # TODO(asaiacai): add ulimits
19
+ # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
21
20
  - name: konduktor-container
22
21
  image: {{ image_id }}
23
22
  # this is set during jobset definition since we need to know the jobset
@@ -39,6 +38,24 @@ kubernetes:
39
38
  value: "{{ num_nodes }}"
40
39
  - name: NUM_GPUS_PER_NODE
41
40
  value: "{{ num_gpus }}"
41
+ {% if tailscale_secret %}
42
+ - name: TS_USERSPACE
43
+ value: "true"
44
+ - name: TS_AUTHKEY
45
+ valueFrom:
46
+ secretKeyRef:
47
+ name: {{ tailscale_secret }}
48
+ key: TS_AUTHKEY
49
+ optional: true
50
+ - name: POD_NAME
51
+ valueFrom:
52
+ fieldRef:
53
+ fieldPath: metadata.name
54
+ - name: POD_UID
55
+ valueFrom:
56
+ fieldRef:
57
+ fieldPath: metadata.uid
58
+ {% endif %}
42
59
  # these are for compatibility with skypilot
43
60
  - name: SKYPILOT_NODE_IPS
44
61
  value: "{{ node_hostnames }}"
@@ -58,7 +75,6 @@ kubernetes:
58
75
  mountPath: /run/konduktor/{{ secret_type }}-secret
59
76
  {% endfor %}
60
77
  command: ["bash", "-c"]
61
- # TODO(asaiacai): should we just mount this as a configmap instead? - Edit: probably not
62
78
  args:
63
79
  - |
64
80
  # TODO(asaiacai): add debug environment variable for printing the apt-update, apt-install, sync-files output
@@ -76,7 +92,7 @@ kubernetes:
76
92
  {% if 'rsync' in run_cmd %}
77
93
  PACKAGES="$PACKAGES rsync";
78
94
  {% endif %}
79
- {% if 'curl' in run_cmd %}
95
+ {% if 'curl' in run_cmd or tailscale_secret %}
80
96
  PACKAGES="$PACKAGES curl";
81
97
  {% endif %}
82
98
  {% if 'gs' in mount_secrets or 's3' in mount_secrets %}
@@ -117,6 +133,17 @@ kubernetes:
117
133
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES >> ~/.konduktor/tmp/apt-install.log;
118
134
  fi;
119
135
  end_epoch=$(date +%s);
136
+
137
+ {% if tailscale_secret %}
138
+ if ! command -v tailscale >/dev/null 2>&1; then
139
+ export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
140
+ $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh >> ~/.konduktor/tmp/tailscale-install.log
141
+ $(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &
142
+ $(prefix_cmd) tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} >/dev/null 2>&1
143
+ fi
144
+ {% endif %}
145
+ end_epoch=$(date +%s);
146
+
120
147
  $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="
121
148
 
122
149
  # unpack secrets credentials
@@ -153,6 +180,7 @@ kubernetes:
153
180
  {% endif %}
154
181
  end_epoch=$(date +%s);
155
182
  end_setup_time=$((end_epoch - start_setup));
183
+ ulimit -Sc 0 && ulimit -Hc 0
156
184
  $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
157
185
  # run task
158
186
  $(prefix_cmd) cd {{ remote_workdir }}
@@ -180,7 +208,8 @@ kubernetes:
180
208
  {% endif %}
181
209
  securityContext:
182
210
  capabilities:
183
- add: ["IPC_LOCK"] # May be needed for memlock
211
+ add:
212
+ - "IPC_LOCK" # May be needed for memlock
184
213
 
185
214
  volumes:
186
215
  - name: shared-memory
@@ -130,6 +130,10 @@ class CommandError(Exception):
130
130
  pass
131
131
 
132
132
 
133
+ class MissingSecretError(Exception):
134
+ pass
135
+
136
+
133
137
  class NotSupportedError(Exception):
134
138
  """Raised when a feature is not supported."""
135
139
 
@@ -563,6 +563,17 @@ def get_config_schema():
563
563
  },
564
564
  }
565
565
 
566
+ tailscale_configs = {
567
+ 'type': 'object',
568
+ 'required': [],
569
+ 'additionalProperties': False,
570
+ 'properties': {
571
+ 'secret_name': {
572
+ 'type': 'string',
573
+ },
574
+ },
575
+ }
576
+
566
577
  for cloud, config in cloud_configs.items():
567
578
  if cloud == 'kubernetes':
568
579
  config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES)
@@ -577,6 +588,7 @@ def get_config_schema():
577
588
  'admin_policy': admin_policy_schema,
578
589
  'nvidia_gpus': gpu_configs,
579
590
  'allowed_clouds': allowed_clouds,
591
+ 'tailscale': tailscale_configs,
580
592
  **cloud_configs,
581
593
  },
582
594
  }
@@ -11,6 +11,7 @@ from typing import Callable, Optional, Union
11
11
  import colorama
12
12
  import rich.console as rich_console
13
13
 
14
+ from konduktor import config
14
15
  from konduktor import logging as konduktor_logging
15
16
 
16
17
  if typing.TYPE_CHECKING:
@@ -211,6 +212,11 @@ def command_hint_messages(hint_type: CommandHintType, job_id: str) -> str:
211
212
  )
212
213
  hint_str = '\n📋 Useful Commands'
213
214
  hint_str += f'{job_hint_str}'
215
+ if config.get_nested(('tailscale', 'secret_name'), None) is not None:
216
+ hint_str += (
217
+ f'\n{INDENT_SYMBOL}To tailscale ssh:\t\t'
218
+ f'{BOLD}ssh root@{job_id}-workers-0-0 {RESET_BOLD}'
219
+ )
214
220
  return hint_str
215
221
  else:
216
222
  raise ValueError(f'Invalid hint type: {hint_type}')
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250430104745"
3
+ version = "0.1.0.dev20250501104750"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}