konduktor-nightly 0.1.0.dev20251030104830__tar.gz → 0.1.0.dev20251211105235__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/PKG-INFO +2 -1
  2. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/constants.py +1 -0
  4. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/deployment.py +13 -2
  5. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/deployment_utils.py +3 -3
  6. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/jobset_utils.py +2 -1
  7. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/pod_utils.py +147 -27
  8. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/cli.py +303 -301
  9. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/aibrix-setup.yaml +157 -1
  10. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/apoxy-setup2.yaml +1 -1
  11. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/deployment.yaml.j2 +5 -3
  12. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/pod.yaml.j2 +123 -9
  13. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/base64_utils.py +2 -0
  14. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/log_utils.py +8 -5
  15. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/pyproject.toml +2 -1
  16. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/LICENSE +0 -0
  17. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/README.md +0 -0
  18. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/__init__.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/aws.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/common.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/gcp.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/authentication.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/__init__.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/backend.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/jobset.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/check.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/config.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/constants.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/__init__.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/constants.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/launch.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/node.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/parse.py +0 -0
  34. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/README.md +0 -0
  35. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/backend/main.py +0 -0
  36. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/backend/sockets.py +0 -0
  37. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  38. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/.gitignore +0 -0
  39. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  40. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  41. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  49. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  52. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  53. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  54. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/globals.css +0 -0
  55. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/layout.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  58. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/page.js +0 -0
  59. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  60. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  61. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/package-lock.json +0 -0
  62. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/package.json +0 -0
  63. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  64. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/server.js +0 -0
  65. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  66. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/__init__.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/aws/__init__.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/aws/s3.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/constants.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/data_utils.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/__init__.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/constants.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/gcs.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/utils.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/registry.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/storage.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/storage_utils.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/execution.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/kube_client.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/logging.py +0 -0
  81. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/apoxy-setup.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/controller_deployment.yaml +0 -0
  83. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  84. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  85. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  86. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/resource.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/serving.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/task.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/jobset.yaml.j2 +0 -0
  90. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/usage/__init__.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/usage/constants.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/__init__.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/accelerator_registry.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/annotations.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/common_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/constants.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/env_options.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/exceptions.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/kubernetes_enums.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/kubernetes_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/loki_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/rich_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/schemas.py +0 -0
  104. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/subprocess_utils.py +0 -0
  105. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/ux_utils.py +0 -0
  106. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20251030104830
3
+ Version: 0.1.0.dev20251211105235
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -29,6 +29,7 @@ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
29
29
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
30
30
  Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
31
31
  Requires-Dist: rich (>=13.9.4,<14.0.0)
32
+ Requires-Dist: sniffio (>=1.3,<2.0)
32
33
  Requires-Dist: websockets (>=15.0.1,<16.0.0)
33
34
  Description-Content-Type: text/markdown
34
35
 
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '2d0baca8b432c156afdb9cc49add78305fe0ed21'
14
+ _KONDUKTOR_COMMIT_SHA = '421390595e3a1b9f263e790323deae61d94da231'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20251030104830'
48
+ __version__ = '1.0.0.dev0.1.0.dev20251211105235'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -18,3 +18,4 @@ STOP_USERNAME_LABEL = 'trainy.ai/stop-username'
18
18
  SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'
19
19
  SECRET_KIND_LABEL = 'trainy.ai/secret-kind'
20
20
  SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'
21
+ ROOT_NAME = 'trainy.ai/root-name'
@@ -54,8 +54,19 @@ def _wait_for_all_ready(namespace: str, name: str):
54
54
  except ApiException:
55
55
  services_map = {}
56
56
 
57
- autoscaler = deployment_utils.get_autoscaler(namespace, name)
58
- autoscalers_map = {name: autoscaler} if autoscaler else {}
57
+ autoscalers_map = {}
58
+ try:
59
+ autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
60
+ if autoscaler_obj:
61
+ # detect aibrix vs general from deployment labels
62
+ labels = (deployment.metadata.labels or {}) if deployment else {}
63
+ is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
64
+ if is_aibrix:
65
+ autoscalers_map[name] = {'kpa': autoscaler_obj}
66
+ else:
67
+ autoscalers_map[name] = {'hpa': autoscaler_obj}
68
+ except ApiException:
69
+ pass
59
70
 
60
71
  status = deployment_utils.get_model_status(
61
72
  name, deployments_map, services_map, autoscalers_map
@@ -998,13 +998,13 @@ def get_envoy_external_ip() -> Optional[str]:
998
998
 
999
999
 
1000
1000
  def get_ingress_nginx_external_ip() -> Optional[str]:
1001
- """Get the external IP of the ingress-nginx-controller LoadBalancer."""
1001
+ """Get the external IP of the keda-ingress-nginx-controller LoadBalancer."""
1002
1002
  context = kubernetes_utils.get_current_kube_config_context_name()
1003
1003
  core_api = kube_client.core_api(context=context)
1004
1004
  try:
1005
- # Look for ingress-nginx-controller service in keda namespace
1005
+ # Look for keda-ingress-nginx-controller service in keda namespace
1006
1006
  service = core_api.read_namespaced_service(
1007
- name='ingress-nginx-controller', namespace='keda'
1007
+ name='keda-ingress-nginx-controller', namespace='keda'
1008
1008
  )
1009
1009
  if service.spec.type == 'LoadBalancer':
1010
1010
  ingress = service.status.load_balancer.ingress
@@ -449,7 +449,8 @@ def _format_timestamp(timestamp: str) -> str:
449
449
 
450
450
 
451
451
  def _get_job_start_time(job: Dict[str, Any]) -> str:
452
- for condition in job['status'].get('conditions', []):
452
+ status = job.get('status', {})
453
+ for condition in status.get('conditions', []):
453
454
  if condition['reason'] == 'ResumeJobs':
454
455
  return condition.get('lastTransitionTime', '')
455
456
  return '-'
@@ -28,6 +28,8 @@ if typing.TYPE_CHECKING:
28
28
  logger = logging.get_logger(__name__)
29
29
 
30
30
  _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
31
+ # Use a large default (7 days) to mimic "infinite" runtime.
32
+ _DEFAULT_MAX_RUN_DURATION_SECONDS = 604800
31
33
 
32
34
 
33
35
  def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
@@ -153,7 +155,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
153
155
  git_ssh_secret_name = None
154
156
  env_secret_envs = []
155
157
  default_secrets = []
158
+ basename_by_k8s: Dict[str, str] = {}
156
159
 
160
+ # only get own secrets
157
161
  user_hash = common_utils.get_user_hash()
158
162
  label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
159
163
  user_secrets = kubernetes_utils.list_secrets(
@@ -162,19 +166,36 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
162
166
 
163
167
  for secret in user_secrets:
164
168
  kind = kubernetes_utils.get_secret_kind(secret)
169
+
170
+ # in case the user modified their secret to have no key:value data
171
+ if secret.data is None:
172
+ secret.data = {}
173
+
174
+ # fill the map for *all* secrets we see
175
+ k8s_name = secret.metadata.name
176
+ lbls = secret.metadata.labels or {}
177
+ base = lbls.get(
178
+ backend_constants.SECRET_BASENAME_LABEL,
179
+ # fallback: strip trailing "-<something>" once if present
180
+ k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
181
+ )
182
+ basename_by_k8s[k8s_name] = base
183
+
165
184
  if kind == 'git-ssh' and git_ssh_secret_name is None:
166
185
  git_ssh_secret_name = secret.metadata.name
167
186
  elif kind == 'env':
168
187
  env_secret_name = secret.metadata.name
169
- key = next(iter(secret.data))
170
- env_secret_envs.append(
171
- {
172
- 'name': key,
173
- 'valueFrom': {
174
- 'secretKeyRef': {'name': env_secret_name, 'key': key}
175
- },
176
- }
177
- )
188
+ # iterate ALL keys, not just one (ex. if user made a multi-key env secret)
189
+ for key, _ in secret.data.items():
190
+ # wire the env var to read its value from a k8s secret
191
+ env_secret_envs.append(
192
+ {
193
+ 'name': key,
194
+ 'valueFrom': {
195
+ 'secretKeyRef': {'name': env_secret_name, 'key': key}
196
+ },
197
+ }
198
+ )
178
199
  elif kind == 'default':
179
200
  default_secret_name = secret.metadata.name
180
201
  basename = secret.metadata.labels.get(
@@ -184,6 +205,22 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
184
205
  {'k8s_name': default_secret_name, 'mount_name': basename}
185
206
  )
186
207
 
208
+ # Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
209
+ uses_default_secret_var = (
210
+ 'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
211
+ or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
212
+ or '/konduktor/default-secrets/' in (task.run or '')
213
+ or '/konduktor/default-secrets/' in (task.setup or '')
214
+ )
215
+ if uses_default_secret_var and not default_secrets:
216
+ raise exceptions.MissingSecretError(
217
+ f'Task references KONDUKTOR_DEFAULT_SECRETS or '
218
+ f'/konduktor/default-secrets but '
219
+ f'user {common_utils.get_cleaned_username()} '
220
+ f'has no default secrets. Paths like '
221
+ f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
222
+ )
223
+
187
224
  # Inject --served-model-name, --host, and --port into serving run command
188
225
  if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
189
226
  if '--served-model-name' and '--host' and '--port' not in task.run:
@@ -262,31 +299,111 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
262
299
  },
263
300
  temp.name,
264
301
  )
302
+
303
+ # Capture the template env names BEFORE user config is merged
304
+ pod_config_template = common_utils.read_yaml(temp.name)
305
+ tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
306
+ 'containers'
307
+ ][0].get('env', [])
308
+ tmpl_env_names = {e['name'] for e in tmpl_envs}
309
+
265
310
  pod_config = common_utils.read_yaml(temp.name)
266
- # merge with `~/.konduktor/config.yaml``
311
+ # merge with `~/.konduktor/config.yaml` (config.yaml overrides template)
267
312
  kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
268
313
  pod_config = common_utils.read_yaml(temp.name)
269
314
 
270
- # Priority order: task.envs > secret envs > existing pod_config envs
271
- existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
315
+ # Find what came from user config (appeared after combine, not in template)
316
+ premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
272
317
  'env', []
273
318
  )
274
- env_map = {env['name']: env for env in existing_envs}
319
+ premerge_names = {e['name'] for e in premerge_envs}
320
+ config_env_names0 = premerge_names - tmpl_env_names
321
+
322
+ # Build final env list
323
+ env_map = {env['name']: env for env in premerge_envs}
275
324
 
276
- # Inject secret envs
325
+ # Inject secret envs (env secrets override config.yaml)
277
326
  for env in env_secret_envs:
278
327
  env_map[env['name']] = env
279
328
 
280
- # Inject task.envs
329
+ # Inject task envs
330
+ # CLI+task.yaml overrides everything else
331
+ # CLI already overrode task.yaml in other code
281
332
  for k, v in task.envs.items():
282
333
  env_map[k] = {'name': k, 'value': v}
283
334
 
284
- # Replace the container's env section with the merged and prioritized map
285
- pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
286
- env_map.values()
335
+ final_envs_list = list(env_map.values())
336
+ pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
337
+ final_envs_list
287
338
  )
339
+ container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
340
+ final_envs = container['env']
341
+ final_names = {e['name'] for e in final_envs}
342
+
288
343
  logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')
289
344
 
345
+ # 1) Get secret envs actually used in the final env list
346
+ secret_details = sorted(
347
+ (e['name'], e['valueFrom']['secretKeyRef']['name'])
348
+ for e in final_envs
349
+ if isinstance(e, dict)
350
+ and e.get('valueFrom', {})
351
+ and e['valueFrom'].get('secretKeyRef')
352
+ )
353
+ secret_names = [n for n, _ in secret_details]
354
+
355
+ # 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
356
+ task_all_names = sorted(
357
+ n
358
+ for n in (task.envs or {}).keys()
359
+ if n in final_names and n not in secret_names
360
+ )
361
+
362
+ # 3) Get Config.yaml envs actually used in the final env list
363
+ config_names = sorted(
364
+ n
365
+ for n in config_env_names0
366
+ if n in final_names and n not in secret_names and n not in task_all_names
367
+ )
368
+
369
+ # 4) Get other envs (template/system) actually used in the final env list
370
+ other_names = sorted(
371
+ final_names - set(secret_names) - set(task_all_names) - set(config_names)
372
+ )
373
+
374
+ # Export helper envs for the startup script (names only)
375
+ def _append_helper(name: str, values):
376
+ container['env'].append({'name': name, 'value': ','.join(values)})
377
+
378
+ # to show user basenames of k8s secrets instead of actual
379
+ # k8s secret names (which have added suffixes)
380
+ secret_map_pairs = [
381
+ f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
382
+ for (var, secret_k8s) in secret_details
383
+ ]
384
+
385
+ # Priority order: CLI > task.yaml > env secret > config > template/system
386
+ _append_helper(
387
+ 'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
388
+ secret_names,
389
+ )
390
+ _append_helper(
391
+ 'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
392
+ secret_map_pairs,
393
+ )
394
+ _append_helper(
395
+ 'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
396
+ task_all_names,
397
+ )
398
+ _append_helper(
399
+ 'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
400
+ config_names,
401
+ )
402
+ _append_helper(
403
+ 'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
404
+ other_names,
405
+ )
406
+
290
407
  # validate pod spec using json schema
291
408
  try:
292
409
  validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
@@ -356,18 +473,21 @@ def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task')
356
473
  jobset_spec: The JobSet spec dictionary to modify
357
474
  task: The task object containing resource information
358
475
  """
359
- # Add max run duration annotation
360
- assert task.resources is not None and task.resources.labels is not None
361
- maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
362
- if not maxRunDurationSeconds:
363
- raise ValueError('maxRunDurationSeconds is required')
364
- jobset_spec['jobset']['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = (
365
- str(maxRunDurationSeconds)
476
+ assert task.resources is not None, 'Task resources are required'
477
+ labels = task.resources.labels or {}
478
+
479
+ # Add max run duration annotation, defaulting to a practically infinite value.
480
+ maxRunDurationSeconds = labels.get('maxRunDurationSeconds')
481
+ metadata = jobset_spec['jobset']['metadata']
482
+ metadata.setdefault('annotations', {})[_RUN_DURATION_ANNOTATION_KEY] = str(
483
+ maxRunDurationSeconds
484
+ if maxRunDurationSeconds is not None
485
+ else _DEFAULT_MAX_RUN_DURATION_SECONDS
366
486
  )
367
487
 
368
488
  # Inject resource labels into JobSet metadata.
369
- if task.resources and task.resources.labels:
370
- jobset_spec['jobset']['metadata']['labels'].update(task.resources.labels)
489
+ if labels:
490
+ jobset_spec['jobset']['metadata']['labels'].update(labels)
371
491
 
372
492
 
373
493
  def merge_pod_into_jobset_template(