konduktor-nightly 0.1.0.dev20251030104830__tar.gz → 0.1.0.dev20251124105105__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as they appear in their respective public registries.
Files changed (106)
  1. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/constants.py +1 -0
  4. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/deployment.py +13 -2
  5. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/deployment_utils.py +3 -3
  6. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/jobset_utils.py +2 -1
  7. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/pod_utils.py +136 -19
  8. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/cli.py +171 -132
  9. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/aibrix-setup.yaml +157 -1
  10. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/apoxy-setup2.yaml +1 -1
  11. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/templates/deployment.yaml.j2 +5 -3
  12. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/templates/pod.yaml.j2 +123 -9
  13. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/base64_utils.py +2 -0
  14. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/log_utils.py +8 -5
  15. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/pyproject.toml +1 -1
  16. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/LICENSE +0 -0
  17. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/README.md +0 -0
  18. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/adaptors/__init__.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/adaptors/aws.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/adaptors/common.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/adaptors/gcp.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/authentication.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/__init__.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/backend.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/backends/jobset.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/check.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/config.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/constants.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/controller/__init__.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/controller/constants.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/controller/launch.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/controller/node.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/controller/parse.py +0 -0
  34. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/README.md +0 -0
  35. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/backend/main.py +0 -0
  36. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/backend/sockets.py +0 -0
  37. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  38. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/.gitignore +0 -0
  39. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  40. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  41. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  49. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  52. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  53. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  54. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/globals.css +0 -0
  55. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/layout.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  58. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/app/page.js +0 -0
  59. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  60. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  61. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/package-lock.json +0 -0
  62. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/package.json +0 -0
  63. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  64. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/server.js +0 -0
  65. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  66. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/__init__.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/aws/__init__.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/aws/s3.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/constants.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/data_utils.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/gcp/__init__.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/gcp/constants.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/gcp/gcs.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/gcp/utils.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/registry.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/storage.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/data/storage_utils.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/execution.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/kube_client.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/logging.py +0 -0
  81. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/apoxy-setup.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/controller_deployment.yaml +0 -0
  83. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  84. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  85. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  86. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/resource.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/serving.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/task.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/templates/jobset.yaml.j2 +0 -0
  90. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/usage/__init__.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/usage/constants.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/__init__.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/accelerator_registry.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/annotations.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/common_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/constants.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/env_options.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/exceptions.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/kubernetes_enums.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/kubernetes_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/loki_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/rich_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/schemas.py +0 -0
  104. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/subprocess_utils.py +0 -0
  105. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/ux_utils.py +0 -0
  106. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251124105105}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20251030104830
3
+ Version: 0.1.0.dev20251124105105
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '2d0baca8b432c156afdb9cc49add78305fe0ed21'
14
+ _KONDUKTOR_COMMIT_SHA = '4837817a46660c45c449eec1cf69ac90ba5b8390'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20251030104830'
48
+ __version__ = '1.0.0.dev0.1.0.dev20251124105105'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -18,3 +18,4 @@ STOP_USERNAME_LABEL = 'trainy.ai/stop-username'
18
18
  SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'
19
19
  SECRET_KIND_LABEL = 'trainy.ai/secret-kind'
20
20
  SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'
21
+ ROOT_NAME = 'trainy.ai/root-name'
@@ -54,8 +54,19 @@ def _wait_for_all_ready(namespace: str, name: str):
54
54
  except ApiException:
55
55
  services_map = {}
56
56
 
57
- autoscaler = deployment_utils.get_autoscaler(namespace, name)
58
- autoscalers_map = {name: autoscaler} if autoscaler else {}
57
+ autoscalers_map = {}
58
+ try:
59
+ autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
60
+ if autoscaler_obj:
61
+ # detect aibrix vs general from deployment labels
62
+ labels = (deployment.metadata.labels or {}) if deployment else {}
63
+ is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
64
+ if is_aibrix:
65
+ autoscalers_map[name] = {'kpa': autoscaler_obj}
66
+ else:
67
+ autoscalers_map[name] = {'hpa': autoscaler_obj}
68
+ except ApiException:
69
+ pass
59
70
 
60
71
  status = deployment_utils.get_model_status(
61
72
  name, deployments_map, services_map, autoscalers_map
@@ -998,13 +998,13 @@ def get_envoy_external_ip() -> Optional[str]:
998
998
 
999
999
 
1000
1000
  def get_ingress_nginx_external_ip() -> Optional[str]:
1001
- """Get the external IP of the ingress-nginx-controller LoadBalancer."""
1001
+ """Get the external IP of the keda-ingress-nginx-controller LoadBalancer."""
1002
1002
  context = kubernetes_utils.get_current_kube_config_context_name()
1003
1003
  core_api = kube_client.core_api(context=context)
1004
1004
  try:
1005
- # Look for ingress-nginx-controller service in keda namespace
1005
+ # Look for keda-ingress-nginx-controller service in keda namespace
1006
1006
  service = core_api.read_namespaced_service(
1007
- name='ingress-nginx-controller', namespace='keda'
1007
+ name='keda-ingress-nginx-controller', namespace='keda'
1008
1008
  )
1009
1009
  if service.spec.type == 'LoadBalancer':
1010
1010
  ingress = service.status.load_balancer.ingress
@@ -449,7 +449,8 @@ def _format_timestamp(timestamp: str) -> str:
449
449
 
450
450
 
451
451
  def _get_job_start_time(job: Dict[str, Any]) -> str:
452
- for condition in job['status'].get('conditions', []):
452
+ status = job.get('status', {})
453
+ for condition in status.get('conditions', []):
453
454
  if condition['reason'] == 'ResumeJobs':
454
455
  return condition.get('lastTransitionTime', '')
455
456
  return '-'
@@ -153,7 +153,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
153
153
  git_ssh_secret_name = None
154
154
  env_secret_envs = []
155
155
  default_secrets = []
156
+ basename_by_k8s: Dict[str, str] = {}
156
157
 
158
+ # only get own secrets
157
159
  user_hash = common_utils.get_user_hash()
158
160
  label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
159
161
  user_secrets = kubernetes_utils.list_secrets(
@@ -162,19 +164,36 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
162
164
 
163
165
  for secret in user_secrets:
164
166
  kind = kubernetes_utils.get_secret_kind(secret)
167
+
168
+ # incase the user modified their secret to have no key:value data
169
+ if secret.data is None:
170
+ secret.data = {}
171
+
172
+ # fill the map for *all* secrets we see
173
+ k8s_name = secret.metadata.name
174
+ lbls = secret.metadata.labels or {}
175
+ base = lbls.get(
176
+ backend_constants.SECRET_BASENAME_LABEL,
177
+ # fallback: strip trailing "-<something>" once if present
178
+ k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
179
+ )
180
+ basename_by_k8s[k8s_name] = base
181
+
165
182
  if kind == 'git-ssh' and git_ssh_secret_name is None:
166
183
  git_ssh_secret_name = secret.metadata.name
167
184
  elif kind == 'env':
168
185
  env_secret_name = secret.metadata.name
169
- key = next(iter(secret.data))
170
- env_secret_envs.append(
171
- {
172
- 'name': key,
173
- 'valueFrom': {
174
- 'secretKeyRef': {'name': env_secret_name, 'key': key}
175
- },
176
- }
177
- )
186
+ # iterate ALL keys, not just one (ex. if user made a multi-key env secret)
187
+ for key, _ in secret.data.items():
188
+ # wire the env var to read its value from a k8s secret
189
+ env_secret_envs.append(
190
+ {
191
+ 'name': key,
192
+ 'valueFrom': {
193
+ 'secretKeyRef': {'name': env_secret_name, 'key': key}
194
+ },
195
+ }
196
+ )
178
197
  elif kind == 'default':
179
198
  default_secret_name = secret.metadata.name
180
199
  basename = secret.metadata.labels.get(
@@ -184,6 +203,22 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
184
203
  {'k8s_name': default_secret_name, 'mount_name': basename}
185
204
  )
186
205
 
206
+ # Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
207
+ uses_default_secret_var = (
208
+ 'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
209
+ or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
210
+ or '/konduktor/default-secrets/' in (task.run or '')
211
+ or '/konduktor/default-secrets/' in (task.setup or '')
212
+ )
213
+ if uses_default_secret_var and not default_secrets:
214
+ raise exceptions.MissingSecretError(
215
+ f'Task references KONDUKTOR_DEFAULT_SECRETS or '
216
+ f'/konduktor/default-secrets but '
217
+ f'user {common_utils.get_cleaned_username()} '
218
+ f'has no default secrets. Paths like '
219
+ f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
220
+ )
221
+
187
222
  # Inject --served-model-name, --host, and --port into serving run command
188
223
  if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
189
224
  if '--served-model-name' and '--host' and '--port' not in task.run:
@@ -262,31 +297,111 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
262
297
  },
263
298
  temp.name,
264
299
  )
300
+
301
+ # Capture the template env names BEFORE user config is merged
302
+ pod_config_template = common_utils.read_yaml(temp.name)
303
+ tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
304
+ 'containers'
305
+ ][0].get('env', [])
306
+ tmpl_env_names = {e['name'] for e in tmpl_envs}
307
+
265
308
  pod_config = common_utils.read_yaml(temp.name)
266
- # merge with `~/.konduktor/config.yaml``
309
+ # merge with `~/.konduktor/config.yaml`` (config.yaml overrides template)
267
310
  kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
268
311
  pod_config = common_utils.read_yaml(temp.name)
269
312
 
270
- # Priority order: task.envs > secret envs > existing pod_config envs
271
- existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
313
+ # Find what came from user config (appeared after combine, not in template)
314
+ premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
272
315
  'env', []
273
316
  )
274
- env_map = {env['name']: env for env in existing_envs}
317
+ premerge_names = {e['name'] for e in premerge_envs}
318
+ config_env_names0 = premerge_names - tmpl_env_names
275
319
 
276
- # Inject secret envs
320
+ # Build final env list
321
+ env_map = {env['name']: env for env in premerge_envs}
322
+
323
+ # Inject secret envs (env secrets override config.yaml)
277
324
  for env in env_secret_envs:
278
325
  env_map[env['name']] = env
279
326
 
280
- # Inject task.envs
327
+ # Inject task envs
328
+ # CLI+task.yaml overrides everything else
329
+ # CLI already overrode task.yaml in other code
281
330
  for k, v in task.envs.items():
282
331
  env_map[k] = {'name': k, 'value': v}
283
332
 
284
- # Replace the container's env section with the merged and prioritized map
285
- pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
286
- env_map.values()
333
+ final_envs_list = list(env_map.values())
334
+ pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
335
+ final_envs_list
287
336
  )
337
+ container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
338
+ final_envs = container['env']
339
+ final_names = {e['name'] for e in final_envs}
340
+
288
341
  logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')
289
342
 
343
+ # 1) Get secret envs actually used in the final env list
344
+ secret_details = sorted(
345
+ (e['name'], e['valueFrom']['secretKeyRef']['name'])
346
+ for e in final_envs
347
+ if isinstance(e, dict)
348
+ and e.get('valueFrom', {})
349
+ and e['valueFrom'].get('secretKeyRef')
350
+ )
351
+ secret_names = [n for n, _ in secret_details]
352
+
353
+ # 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
354
+ task_all_names = sorted(
355
+ n
356
+ for n in (task.envs or {}).keys()
357
+ if n in final_names and n not in secret_names
358
+ )
359
+
360
+ # 3) Get Config.yaml envs actually used in the final env list
361
+ config_names = sorted(
362
+ n
363
+ for n in config_env_names0
364
+ if n in final_names and n not in secret_names and n not in task_all_names
365
+ )
366
+
367
+ # 4) Get other envs (template/system) actually used in the final env list
368
+ other_names = sorted(
369
+ final_names - set(secret_names) - set(task_all_names) - set(config_names)
370
+ )
371
+
372
+ # Export helper envs for the startup script (names only)
373
+ def _append_helper(name: str, values):
374
+ container['env'].append({'name': name, 'value': ','.join(values)})
375
+
376
+ # to show user basenames of k8s secrets instead of actual
377
+ # k8s secret names (which have added suffixes)
378
+ secret_map_pairs = [
379
+ f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
380
+ for (var, secret_k8s) in secret_details
381
+ ]
382
+
383
+ # Priority order: CLI > task.yaml > env secret > config > template/system
384
+ _append_helper(
385
+ 'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
386
+ secret_names,
387
+ )
388
+ _append_helper(
389
+ 'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
390
+ secret_map_pairs,
391
+ )
392
+ _append_helper(
393
+ 'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
394
+ task_all_names,
395
+ )
396
+ _append_helper(
397
+ 'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
398
+ config_names,
399
+ )
400
+ _append_helper(
401
+ 'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
402
+ other_names,
403
+ )
404
+
290
405
  # validate pod spec using json schema
291
406
  try:
292
407
  validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
@@ -357,7 +472,9 @@ def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task')
357
472
  task: The task object containing resource information
358
473
  """
359
474
  # Add max run duration annotation
360
- assert task.resources is not None and task.resources.labels is not None
475
+ assert (
476
+ task.resources is not None and task.resources.labels is not None
477
+ ), 'Task resources and task.resources.labels are required'
361
478
  maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
362
479
  if not maxRunDurationSeconds:
363
480
  raise ValueError('maxRunDurationSeconds is required')