konduktor-nightly 0.1.0.dev20251030104830__tar.gz → 0.1.0.dev20251101104430__tar.gz

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.

Files changed (106)
  1. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/constants.py +1 -0
  4. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/deployment.py +13 -2
  5. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/deployment_utils.py +3 -3
  6. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/jobset_utils.py +2 -1
  7. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/pod_utils.py +133 -18
  8. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/cli.py +16 -6
  9. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/aibrix-setup.yaml +157 -1
  10. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/apoxy-setup2.yaml +1 -1
  11. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/templates/deployment.yaml.j2 +5 -3
  12. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/templates/pod.yaml.j2 +119 -9
  13. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/base64_utils.py +2 -0
  14. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/pyproject.toml +1 -1
  15. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/LICENSE +0 -0
  16. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/README.md +0 -0
  17. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/adaptors/__init__.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/adaptors/aws.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/adaptors/common.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/adaptors/gcp.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/authentication.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/__init__.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/backend.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/backends/jobset.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/check.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/config.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/constants.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/controller/__init__.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/controller/constants.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/controller/launch.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/controller/node.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/controller/parse.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/README.md +0 -0
  34. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/backend/main.py +0 -0
  35. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/backend/sockets.py +0 -0
  36. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  37. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/.gitignore +0 -0
  38. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  39. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  40. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  48. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  52. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  53. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/globals.css +0 -0
  54. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/layout.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/app/page.js +0 -0
  58. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  59. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  60. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/package-lock.json +0 -0
  61. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/package.json +0 -0
  62. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  63. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/server.js +0 -0
  64. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  65. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/__init__.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/aws/__init__.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/aws/s3.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/constants.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/data_utils.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/gcp/__init__.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/gcp/constants.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/gcp/gcs.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/gcp/utils.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/registry.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/storage.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/data/storage_utils.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/execution.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/kube_client.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/logging.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/apoxy-setup.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/controller_deployment.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  83. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  84. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  85. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/resource.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/serving.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/task.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/templates/jobset.yaml.j2 +0 -0
  89. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/usage/__init__.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/usage/constants.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/__init__.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/accelerator_registry.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/annotations.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/common_utils.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/constants.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/env_options.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/exceptions.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/kubernetes_enums.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/kubernetes_utils.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/log_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/loki_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/rich_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/schemas.py +0 -0
  104. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/subprocess_utils.py +0 -0
  105. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/ux_utils.py +0 -0
  106. {konduktor_nightly-0.1.0.dev20251030104830 → konduktor_nightly-0.1.0.dev20251101104430}/konduktor/utils/validator.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: konduktor-nightly
- Version: 0.1.0.dev20251030104830
+ Version: 0.1.0.dev20251101104430
  Summary: GPU Cluster Health Management
  Author: Andrew Aikawa
  Author-email: asai@berkeley.edu
konduktor/__init__.py
@@ -11,7 +11,7 @@ from konduktor.task import Task
  __all__ = ['launch', 'Resources', 'Task', 'Serving']

  # Replaced with the current commit when building the wheels.
- _KONDUKTOR_COMMIT_SHA = '2d0baca8b432c156afdb9cc49add78305fe0ed21'
+ _KONDUKTOR_COMMIT_SHA = 'd5fddf4e144c4887227e1c6943c70bcd72d364d5'
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)


@@ -45,5 +45,5 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev0.1.0.dev20251030104830'
+ __version__ = '1.0.0.dev0.1.0.dev20251101104430'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
konduktor/backends/constants.py
@@ -18,3 +18,4 @@ STOP_USERNAME_LABEL = 'trainy.ai/stop-username'
  SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'
  SECRET_KIND_LABEL = 'trainy.ai/secret-kind'
  SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'
+ ROOT_NAME = 'trainy.ai/root-name'
konduktor/backends/deployment.py
@@ -54,8 +54,19 @@ def _wait_for_all_ready(namespace: str, name: str):
  except ApiException:
  services_map = {}

- autoscaler = deployment_utils.get_autoscaler(namespace, name)
- autoscalers_map = {name: autoscaler} if autoscaler else {}
+ autoscalers_map = {}
+ try:
+ autoscaler_obj = deployment_utils.get_autoscaler(namespace, name)
+ if autoscaler_obj:
+ # detect aibrix vs general from deployment labels
+ labels = (deployment.metadata.labels or {}) if deployment else {}
+ is_aibrix = deployment_utils.AIBRIX_NAME_LABEL in labels
+ if is_aibrix:
+ autoscalers_map[name] = {'kpa': autoscaler_obj}
+ else:
+ autoscalers_map[name] = {'hpa': autoscaler_obj}
+ except ApiException:
+ pass

  status = deployment_utils.get_model_status(
  name, deployments_map, services_map, autoscalers_map
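
Note on the hunk above: `_wait_for_all_ready` now keys the autoscaler map by deployment name and nests the autoscaler under `'kpa'` (AIBrix deployments) or `'hpa'` (general deployments), chosen by whether the deployment carries the AIBrix name label. A minimal sketch of that shape follows; the label value used here is an illustrative assumption, not konduktor's actual constant.

```python
# Hypothetical sketch of the autoscalers_map shape built above. The label
# value below is assumed for illustration; konduktor reads it from
# deployment_utils.AIBRIX_NAME_LABEL.
def build_autoscalers_map(name, labels, autoscaler_obj,
                          aibrix_label='model.aibrix.ai/name'):
    if autoscaler_obj is None:
        return {}                      # no autoscaler found for this deployment
    kind = 'kpa' if aibrix_label in labels else 'hpa'
    return {name: {kind: autoscaler_obj}}

# An AIBrix-labeled deployment is reported under 'kpa', everything else under 'hpa'.
print(build_autoscalers_map('llama3', {'model.aibrix.ai/name': 'llama3'}, {'minReplicas': 0}))
print(build_autoscalers_map('webapp', {}, {'minReplicas': 1}))
```
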
konduktor/backends/deployment_utils.py
@@ -998,13 +998,13 @@ def get_envoy_external_ip() -> Optional[str]:


  def get_ingress_nginx_external_ip() -> Optional[str]:
- """Get the external IP of the ingress-nginx-controller LoadBalancer."""
+ """Get the external IP of the keda-ingress-nginx-controller LoadBalancer."""
  context = kubernetes_utils.get_current_kube_config_context_name()
  core_api = kube_client.core_api(context=context)
  try:
- # Look for ingress-nginx-controller service in keda namespace
+ # Look for keda-ingress-nginx-controller service in keda namespace
  service = core_api.read_namespaced_service(
- name='ingress-nginx-controller', namespace='keda'
+ name='keda-ingress-nginx-controller', namespace='keda'
  )
  if service.spec.type == 'LoadBalancer':
  ingress = service.status.load_balancer.ingress
konduktor/backends/jobset_utils.py
@@ -449,7 +449,8 @@ def _format_timestamp(timestamp: str) -> str:


  def _get_job_start_time(job: Dict[str, Any]) -> str:
- for condition in job['status'].get('conditions', []):
+ status = job.get('status', {})
+ for condition in status.get('conditions', []):
  if condition['reason'] == 'ResumeJobs':
  return condition.get('lastTransitionTime', '')
  return '-'
konduktor/backends/pod_utils.py
@@ -153,7 +153,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
  git_ssh_secret_name = None
  env_secret_envs = []
  default_secrets = []
+ basename_by_k8s: Dict[str, str] = {}

+ # only get own secrets
  user_hash = common_utils.get_user_hash()
  label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
  user_secrets = kubernetes_utils.list_secrets(
@@ -162,19 +164,36 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:

  for secret in user_secrets:
  kind = kubernetes_utils.get_secret_kind(secret)
+
+ # incase the user modified their secret to have no key:value data
+ if secret.data is None:
+ secret.data = {}
+
+ # fill the map for *all* secrets we see
+ k8s_name = secret.metadata.name
+ lbls = secret.metadata.labels or {}
+ base = lbls.get(
+ backend_constants.SECRET_BASENAME_LABEL,
+ # fallback: strip trailing "-<something>" once if present
+ k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
+ )
+ basename_by_k8s[k8s_name] = base
+
  if kind == 'git-ssh' and git_ssh_secret_name is None:
  git_ssh_secret_name = secret.metadata.name
  elif kind == 'env':
  env_secret_name = secret.metadata.name
- key = next(iter(secret.data))
- env_secret_envs.append(
- {
- 'name': key,
- 'valueFrom': {
- 'secretKeyRef': {'name': env_secret_name, 'key': key}
- },
- }
- )
+ # iterate ALL keys, not just one (ex. if user made a multi-key env secret)
+ for key, _ in secret.data.items():
+ # wire the env var to read its value from a k8s secret
+ env_secret_envs.append(
+ {
+ 'name': key,
+ 'valueFrom': {
+ 'secretKeyRef': {'name': env_secret_name, 'key': key}
+ },
+ }
+ )
  elif kind == 'default':
  default_secret_name = secret.metadata.name
  basename = secret.metadata.labels.get(
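
The `elif kind == 'env'` branch above now expands every key of an env secret into its own environment variable instead of only the first key. A self-contained sketch of that expansion, with a plain dict standing in for the Kubernetes secret's `data` field and a made-up secret name:

```python
# Sketch of the multi-key expansion above. 'user-env-abc123' and the key names
# are hypothetical; each key becomes one env var backed by a secretKeyRef to
# the same Kubernetes secret.
def env_entries_from_secret(secret_name: str, secret_data: dict) -> list:
    entries = []
    for key in secret_data:
        entries.append({
            'name': key,
            'valueFrom': {'secretKeyRef': {'name': secret_name, 'key': key}},
        })
    return entries

for entry in env_entries_from_secret('user-env-abc123',
                                     {'WANDB_API_KEY': '...', 'HF_TOKEN': '...'}):
    print(entry)
```
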
@@ -184,6 +203,22 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
  {'k8s_name': default_secret_name, 'mount_name': basename}
  )

+ # Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
+ uses_default_secret_var = (
+ 'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
+ or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
+ or '/konduktor/default-secrets/' in (task.run or '')
+ or '/konduktor/default-secrets/' in (task.setup or '')
+ )
+ if uses_default_secret_var and not default_secrets:
+ raise exceptions.MissingSecretError(
+ f'Task references KONDUKTOR_DEFAULT_SECRETS or '
+ f'/konduktor/default-secrets but '
+ f'user {common_utils.get_cleaned_username()} '
+ f'has no default secrets. Paths like '
+ f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
+ )
+
  # Inject --served-model-name, --host, and --port into serving run command
  if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
  if '--served-model-name' and '--host' and '--port' not in task.run:
@@ -262,31 +297,111 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
  },
  temp.name,
  )
+
+ # Capture the template env names BEFORE user config is merged
+ pod_config_template = common_utils.read_yaml(temp.name)
+ tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
+ 'containers'
+ ][0].get('env', [])
+ tmpl_env_names = {e['name'] for e in tmpl_envs}
+
  pod_config = common_utils.read_yaml(temp.name)
- # merge with `~/.konduktor/config.yaml``
+ # merge with `~/.konduktor/config.yaml`` (config.yaml overrides template)
  kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
  pod_config = common_utils.read_yaml(temp.name)

- # Priority order: task.envs > secret envs > existing pod_config envs
- existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
+ # Find what came from user config (appeared after combine, not in template)
+ premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
  'env', []
  )
- env_map = {env['name']: env for env in existing_envs}
+ premerge_names = {e['name'] for e in premerge_envs}
+ config_env_names0 = premerge_names - tmpl_env_names

- # Inject secret envs
+ # Build final env list
+ env_map = {env['name']: env for env in premerge_envs}
+
+ # Inject secret envs (env secrets override config.yaml)
  for env in env_secret_envs:
  env_map[env['name']] = env

- # Inject task.envs
+ # Inject task envs
+ # CLI+task.yaml overrides everything else
+ # CLI already overrode task.yaml in other code
  for k, v in task.envs.items():
  env_map[k] = {'name': k, 'value': v}

- # Replace the container's env section with the merged and prioritized map
- pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
- env_map.values()
+ final_envs_list = list(env_map.values())
+ pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
+ final_envs_list
  )
+ container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
+ final_envs = container['env']
+ final_names = {e['name'] for e in final_envs}
+
  logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')

+ # 1) Get secret envs actually used in the final env list
+ secret_details = sorted(
+ (e['name'], e['valueFrom']['secretKeyRef']['name'])
+ for e in final_envs
+ if isinstance(e, dict)
+ and e.get('valueFrom', {})
+ and e['valueFrom'].get('secretKeyRef')
+ )
+ secret_names = [n for n, _ in secret_details]
+
+ # 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
+ task_all_names = sorted(
+ n
+ for n in (task.envs or {}).keys()
+ if n in final_names and n not in secret_names
+ )
+
+ # 3) Get Config.yaml envs actually used in the final env list
+ config_names = sorted(
+ n
+ for n in config_env_names0
+ if n in final_names and n not in secret_names and n not in task_all_names
+ )
+
+ # 4) Get other envs (template/system) actually used in the final env list
+ other_names = sorted(
+ final_names - set(secret_names) - set(task_all_names) - set(config_names)
+ )
+
+ # Export helper envs for the startup script (names only)
+ def _append_helper(name: str, values):
+ container['env'].append({'name': name, 'value': ','.join(values)})
+
+ # to show user basenames of k8s secrets instead of actual
+ # k8s secret names (which have added suffixes)
+ secret_map_pairs = [
+ f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
+ for (var, secret_k8s) in secret_details
+ ]
+
+ # Priority order: CLI > task.yaml > env secret > config > template/system
+ _append_helper(
+ 'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
+ secret_names,
+ )
+ _append_helper(
+ 'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
+ secret_map_pairs,
+ )
+ _append_helper(
+ 'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
+ task_all_names,
+ )
+ _append_helper(
+ 'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
+ config_names,
+ )
+ _append_helper(
+ 'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
+ other_names,
+ )
+
  # validate pod spec using json schema
  try:
  validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
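
The rewritten merge above applies environment sources into a dict keyed by variable name, so a later source silently wins: template/system values are overridden by `config.yaml`, which is overridden by env secrets, which are overridden by task.yaml/CLI values; the comma-joined `KONDUKTOR_ENV_*_HOPEFULLY_NO_NAME_COLLISION` helpers only carry the resulting name lists for the startup-script summary. A minimal sketch of the precedence, with made-up names and values:

```python
# Minimal sketch of the env precedence implemented above: sources are merged
# in ascending priority, so the last writer for a given name wins.
def merge_envs(template, config_yaml, env_secrets, task_envs):
    merged = {}
    for source in (template, config_yaml, env_secrets, task_envs):
        for env in source:
            merged[env['name']] = env
    return list(merged.values())

final = merge_envs(
    template=[{'name': 'MODEL_NAME', 'value': 'from-template'}],
    config_yaml=[{'name': 'MODEL_NAME', 'value': 'from-config.yaml'}],
    env_secrets=[{'name': 'HF_TOKEN',
                  'valueFrom': {'secretKeyRef': {'name': 'user-env', 'key': 'HF_TOKEN'}}}],
    task_envs=[{'name': 'MODEL_NAME', 'value': 'from-task.yaml-or-cli'}],
)
for env in final:
    print(env['name'], env.get('value', '<from secretKeyRef>'))
# MODEL_NAME resolves to 'from-task.yaml-or-cli'; HF_TOKEN stays a secret reference.
```
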
konduktor/cli.py
@@ -54,6 +54,7 @@ from konduktor import logging
  from konduktor.backends import constants as backend_constants
  from konduktor.backends import deployment_utils, jobset_utils
  from konduktor.utils import (
+ base64_utils,
  common_utils,
  kubernetes_utils,
  log_utils,
@@ -1491,12 +1492,21 @@ def create(kind, from_file, from_directory, inline, name):
  data = {}
  if from_directory:
  click.echo(f'Creating secret from directory: {from_directory}')
- base_path = pathlib.Path(from_directory)
- for path in base_path.rglob('*'):
- if path.is_file():
- rel_path = path.relative_to(base_path)
- with open(path, 'rb') as f:
- data[str(rel_path)] = b64encode(f.read()).decode()
+ # Use ABSOLUTE directory path so the top-level folder name is preserved
+ base_dir_abs = os.path.abspath(os.path.expanduser(from_directory))
+ if not os.path.isdir(base_dir_abs):
+ raise click.BadParameter(
+ f"--from-directory {from_directory} doesn't exist or is not a directory"
+ )
+ # Ensure there is at least one file inside
+ if not any(p.is_file() for p in pathlib.Path(base_dir_abs).rglob('*')):
+ raise click.BadParameter(f'--from-directory {from_directory} is empty.')
+
+ # Zip + base64 the WHOLE directory (this preserves the inner structure)
+ archive_b64 = base64_utils.zip_base64encode([base_dir_abs])
+
+ # Store as a single key; pod will unzip to the expanded path
+ data = {'payload.zip': archive_b64}
  elif from_file:
  click.echo(f'Creating secret from file: {from_file}')
  key = os.path.basename(from_file)
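
With the change above, secret creation from a directory no longer base64-encodes each file as its own key; it zips the whole directory via `base64_utils.zip_base64encode` and stores the archive under a single `payload.zip` key, which the pod startup script later unzips into the expanded path. A simplified standard-library sketch of that round trip (an illustration, not konduktor's actual `base64_utils` implementation):

```python
# Simplified sketch of the directory -> base64(zip) -> unzip round trip the
# change above relies on; names here are illustrative only.
import base64
import io
import os
import zipfile

def zip_dir_base64(directory: str) -> str:
    buf = io.BytesIO()
    parent = os.path.dirname(os.path.abspath(directory))
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(directory):
            for fname in files:
                if fname == '.DS_Store':          # skip macOS metadata, as the diff does
                    continue
                path = os.path.join(root, fname)
                # keep the top-level folder name, mirroring the absolute-path change
                zf.write(path, os.path.relpath(path, parent))
    return base64.b64encode(buf.getvalue()).decode()

def unzip_base64(archive_b64: str, dest: str) -> None:
    # roughly what `unzip payload.zip -d "$dst"` does in the pod startup script
    with zipfile.ZipFile(io.BytesIO(base64.b64decode(archive_b64))) as zf:
        zf.extractall(dest)
```
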
konduktor/manifests/aibrix-setup.yaml
@@ -34,6 +34,34 @@ metadata:
  name: aibrix-activator
  ---
  apiVersion: v1
+ kind: ServiceAccount
+ metadata:
+ name: aibrix-activator
+ namespace: aibrix-activator
+ ---
+ apiVersion: rbac.authorization.k8s.io/v1
+ kind: ClusterRole
+ metadata:
+ name: aibrix-activator
+ rules:
+ - apiGroups: ["apps"]
+ resources: ["deployments"]
+ verbs: ["get", "list", "watch"]
+ ---
+ apiVersion: rbac.authorization.k8s.io/v1
+ kind: ClusterRoleBinding
+ metadata:
+ name: aibrix-activator
+ roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: aibrix-activator
+ subjects:
+ - kind: ServiceAccount
+ name: aibrix-activator
+ namespace: aibrix-activator
+ ---
+ apiVersion: v1
  kind: ConfigMap
  metadata:
  name: activator-code
@@ -44,16 +72,26 @@ data:
  from collections import defaultdict, deque
  from fastapi import FastAPI, Request
  from fastapi.responses import PlainTextResponse, JSONResponse
+ import asyncio
+ from kubernetes import client, config

  NAMESPACE = os.getenv("NAMESPACE", "default")
  WINDOW_SEC = int(os.getenv("WINDOW_SEC", "30")) # demand lookback
  CAPACITY_RPS = float(os.getenv("CAPACITY_RPS", "1.0")) # per-replica capacity
  MIN_WAKE = int(os.getenv("MIN_REPLICA_ON_WAKE", "1"))
  MAX_REPLICAS = int(os.getenv("MAX_REPLICAS", "8"))
+ CLEANUP_INTERVAL = int(os.getenv("CLEANUP_INTERVAL", "300")) # 5 minutes

  app = FastAPI()
  events = defaultdict(deque) # key=(ns,model) -> deque[timestamps]

+ # Initialize Kubernetes client
+ try:
+ config.load_incluster_config()
+ k8s_apps_v1 = client.AppsV1Api()
+ except:
+ k8s_apps_v1 = None
+
  def _prune(q, now):
  while q and now - q[0] > WINDOW_SEC: q.popleft()

@@ -89,6 +127,48 @@ data:
  pass
  return None

+ def _get_existing_deployments():
+ """Get list of existing Aibrix deployments from Kubernetes"""
+ if not k8s_apps_v1:
+ return set()
+ try:
+ deployments = k8s_apps_v1.list_namespaced_deployment(
+ namespace=NAMESPACE,
+ label_selector="model.aibrix.ai/name"
+ )
+ return {d.metadata.name for d in deployments.items}
+ except Exception:
+ return set()
+
+ def _cleanup_stale_entries():
+ """Remove entries for deployments that no longer exist"""
+ if not k8s_apps_v1:
+ return
+ try:
+ existing_deployments = _get_existing_deployments()
+ # Remove entries for deployments that no longer exist
+ keys_to_remove = []
+ for (ns, model) in list(events.keys()):
+ if ns == NAMESPACE and model not in existing_deployments:
+ keys_to_remove.append((ns, model))
+
+ for key in keys_to_remove:
+ del events[key]
+ print(f"Cleaned up stale entry for deployment: {key[1]}")
+ except Exception as e:
+ print(f"Error during cleanup: {e}")
+
+ async def _cleanup_task():
+ """Background task to periodically clean up stale entries"""
+ while True:
+ await asyncio.sleep(CLEANUP_INTERVAL)
+ _cleanup_stale_entries()
+
+ @app.on_event("startup")
+ async def startup_event():
+ """Start background cleanup task"""
+ asyncio.create_task(_cleanup_task())
+
  # Mirror endpoints (same as your API paths); quick 204 response
  @app.post("/v1/completions")
  @app.post("/v1/chat/completions")
@@ -108,6 +188,37 @@ data:
  _bump(NAMESPACE, model)
  return JSONResponse({"ok": True}, status_code=204)

+ # Prometheus-friendly aggregate endpoint: export ALL (ns, model)
+ @app.get("/metrics", response_class=PlainTextResponse)
+ async def metrics_all():
+ lines = []
+ # Idiomatic names
+ lines.append("# HELP vllm_deployment_replicas Number of suggested replicas.")
+ lines.append("# TYPE vllm_deployment_replicas gauge")
+ lines.append("# HELP vllm_observed_rps Incoming requests per second.")
+ lines.append("# TYPE vllm_observed_rps gauge")
+ now = time.time()
+ for (ns, model), q in list(events.items()):
+ _prune(q, now)
+ rps = len(q) / max(WINDOW_SEC, 1)
+ d = _desired(ns, model)
+ lines.append(f'vllm_deployment_replicas{{namespace="{ns}",model_name="{model}"}} {d}')
+ lines.append(f'vllm_observed_rps{{namespace="{ns}",model_name="{model}"}} {rps:.6f}')
+ # (Optional) keep legacy names with colons for back-compat
+ lines.append("# HELP vllm:deployment_replicas Number of suggested replicas.")
+ lines.append("# TYPE vllm:deployment_replicas gauge")
+ lines.append("# HELP vllm:observed_rps Incoming requests per second.")
+ lines.append("# TYPE vllm:observed_rps gauge")
+ now = time.time()
+ for (ns, model), q in list(events.items()):
+ _prune(q, now)
+ rps = len(q) / max(WINDOW_SEC, 1)
+ d = _desired(ns, model)
+ lines.append(f'vllm:deployment_replicas{{namespace="{ns}",model_name="{model}"}} {d}')
+ lines.append(f'vllm:observed_rps{{namespace="{ns}",model_name="{model}"}} {rps:.6f}')
+ return "\n".join(lines) + "\n"
+
+
  # Metrics for KPA and Debugging
  @app.get("/metrics/{ns}/{model}", response_class=PlainTextResponse)
@@ -142,7 +253,7 @@ spec:
  command: ["bash","-lc"]
  args:
  - |
- pip install fastapi uvicorn >/dev/null && \
+ pip install fastapi uvicorn kubernetes >/dev/null && \
  uvicorn activator:app --host 0.0.0.0 --port 8080
  env:
  - { name: NAMESPACE, value: "default" }
@@ -150,10 +261,12 @@ spec:
  - { name: CAPACITY_RPS, value: "1.0" }
  - { name: MIN_REPLICA_ON_WAKE, value: "1" }
  - { name: MAX_REPLICAS, value: "8" }
+ - { name: CLEANUP_INTERVAL, value: "300" }
  ports: [{containerPort: 8080}]
  volumeMounts:
  - { name: code, mountPath: /app/activator.py, subPath: activator.py }
  workingDir: /app
+ serviceAccountName: aibrix-activator
  volumes:
  - name: code
  configMap: { name: activator-code }
@@ -163,6 +276,13 @@ kind: Service
  metadata:
  name: aibrix-activator
  namespace: aibrix-activator
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "8080"
+ prometheus.io/path: "/metrics"
+ labels:
+ app: aibrix-activator
+ prometheus-discovery: "true"
  spec:
  selector: { app: aibrix-activator }
  ports:
@@ -172,6 +292,42 @@ spec:
  protocol: TCP
  type: ClusterIP
  ---
+ apiVersion: monitoring.coreos.com/v1
+ kind: ServiceMonitor
+ metadata:
+ name: aibrix-activator
+ namespace: prometheus
+ labels:
+ app: aibrix-activator
+ spec:
+ selector:
+ matchLabels:
+ app: aibrix-activator
+ namespaceSelector:
+ matchNames:
+ - aibrix-activator
+ endpoints:
+ - port: http
+ path: /metrics
+ ---
+ apiVersion: monitoring.coreos.com/v1
+ kind: ServiceMonitor
+ metadata:
+ name: vllm-deployments
+ namespace: prometheus
+ labels:
+ app: vllm-deployments
+ spec:
+ selector:
+ matchLabels:
+ prometheus-discovery: "true"
+ namespaceSelector:
+ matchNames:
+ - default
+ endpoints:
+ - port: serve
+ path: /metrics
+ ---
  apiVersion: gateway.networking.k8s.io/v1beta1
  kind: ReferenceGrant
  metadata:
konduktor/manifests/apoxy-setup2.yaml
@@ -59,7 +59,7 @@ metadata:
  name: UNIQUE-TEMPNAME-backend2
  spec:
  endpoints:
- - fqdn: ingress-nginx-controller.keda.UNIQUE-TEMPNAME.tun.apoxy.net
+ - fqdn: keda-ingress-nginx-controller.keda.UNIQUE-TEMPNAME.tun.apoxy.net
  ---
  # HTTPRoute for general deployments
  apiVersion: gateway.apoxy.dev/v1
konduktor/templates/deployment.yaml.j2
@@ -41,11 +41,9 @@ metadata:
  {{ deployment_name_label }}: "{{ name }}"
  {{ deployment_user_label }}: "{{ user }}"
  trainy.ai/has-autoscaler: "{{ autoscaler }}"
- {% if not general %}
  annotations:
  prometheus.io/scrape: "true"
- prometheus.io/port: "8080"
- {% endif %}
+ prometheus.io/port: "9000"
  name: {{ name }}
  namespace: default
  spec:
@@ -142,6 +140,10 @@ apiVersion: networking.k8s.io/v1
  kind: Ingress
  metadata:
  name: {{ name }}-ingress
+ labels:
+ {{ deployment_name_label }}: "{{ name }}"
+ {{ deployment_user_label }}: "{{ user }}"
+ trainy.ai/konduktor-managed: "true"
  annotations:
  nginx.ingress.kubernetes.io/use-regex: "true"
  nginx.ingress.kubernetes.io/rewrite-target: /$1
konduktor/templates/pod.yaml.j2
@@ -28,16 +28,21 @@ kubernetes:
  containers:
  # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
  - name: konduktor-container
- {% if enable_ssh %}
+ {% if enable_ssh or serving %}
  ports:
+ {% if enable_ssh %}
  - name: ssh
  containerPort: {{ konduktor_ssh_port }}
+ {% endif %}
+
+ {% if serving %}
+ - name: serving
+ containerPort: {{ ports }}
+ {% endif %}
  {% endif %}
- {% if serving %}
- ports:
- - containerPort: {{ ports }}
+
+ {% if serving and probe %}
  # TODO (ryan): allow modification of thresholds and timings
- {% if probe %}
  livenessProbe:
  httpGet:
  path: {{ probe }}
@@ -68,7 +73,6 @@ kubernetes:
  successThreshold: 1
  timeoutSeconds: 1
  {% endif %}
- {% endif %}
  image: {{ image_id }}
  # this is set during jobset definition since we need to know the jobset
  # name and number of nodes to set all the environment variables correctly here
@@ -134,6 +138,8 @@ kubernetes:
  {% if default_secrets %}
  - name: KONDUKTOR_DEFAULT_SECRETS
  value: "/konduktor/default-secrets"
+ - name: KONDUKTOR_DEFAULT_SECRETS_EXPANDED
+ value: "/run/konduktor/expanded-default-secrets"
  {% endif %}
  # these are for compatibility with skypilot
  - name: SKYPILOT_NODE_IPS
@@ -146,6 +152,10 @@ kubernetes:
  value: "{{ num_nodes }}"
  - name: SKYPILOT_NUM_GPUS_PER_NODE
  value: "{{ num_gpus }}"
+ - name: RESTART_ATTEMPT
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.labels['jobset.sigs.k8s.io/restart-attempt']
  volumeMounts:
  - name: shared-memory
  mountPath: /dev/shm
@@ -159,6 +169,10 @@ kubernetes:
  - name: default-secret-{{ secret.mount_name }}
  mountPath: /konduktor/default-secrets/{{ secret.mount_name }}
  {% endfor %}
+ {% if default_secrets %}
+ - name: default-secrets-expanded
+ mountPath: /run/konduktor/expanded-default-secrets
+ {% endif %}
  {% if git_ssh %}
  - name: git-ssh-secret
  mountPath: /run/konduktor/git-ssh-secret
@@ -192,7 +206,7 @@ kubernetes:
  {% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
  PACKAGES="$PACKAGES curl";
  {% endif %}
- {% if 'gs' in mount_secrets or 's3' in mount_secrets %}
+ {% if 'gs' in mount_secrets or 's3' in mount_secrets or default_secrets %}
  PACKAGES="$PACKAGES unzip wget";
  {% endif %}
  {% if 'git' in run_cmd or 'git' in setup_cmd %}
@@ -231,7 +245,7 @@ kubernetes:
  fi;
  end_epoch=$(date +%s);

- echo "Exposing ENV variables"
+ echo "===== KONDUKTOR: Exposing ENV variables ====="
  $(prefix_cmd) env -0 | awk -v RS='\0' '
  {
  gsub(/\\/,"\\\\"); # escape existing backslashes first
@@ -346,8 +360,41 @@ kubernetes:

  $(prefix_cmd) echo "===== KONDUKTOR: Installing packages took $((end_epoch - start_epoch)) seconds ====="

+ $(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary ====="
+ start_epoch=$(date +%s);
+
+ print_bucket () {
+ title="$1"; list="${2:-}"
+ echo "--- $title ---"
+ if [ -n "$list" ]; then
+ echo "$list" | tr ',' '\n' | sed "s/^/[$title] /"
+ else
+ echo "[none]"
+ fi
+ }
+
+ # Secrets: prefer detailed mapping if available
+ echo "--- env secret ---"
+ if [ -n "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
+ echo "${KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION}" \
+ | tr ',' '\n' \
+ | awk -F'=' '{ printf("[secret: %s] %s\n", $2, $1) }'
+ elif [ -n "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION:-}" ]; then
+ echo "${KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION}" \
+ | tr ',' '\n' | sed 's/^/[secret] /'
+ else
+ echo "[none]"
+ fi
+
+ print_bucket "CLI + task.yaml" "${KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION}"
+ print_bucket "config.yaml" "${KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION}"
+ print_bucket "other" "${KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION}"
+
+ end_epoch=$(date +%s);
+ $(prefix_cmd) echo "===== KONDUKTOR: Environment variable summary took $((end_epoch - start_epoch)) seconds ====="
+
  # unpack secrets credentials
- $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials ====="
+ $(prefix_cmd) echo "===== KONDUKTOR: Unpacking cloud storage secret credentials ====="
  start_epoch=$(date +%s);
  mkdir -p ~/.konduktor
  mkdir -p {{ remote_workdir }}
@@ -362,12 +409,71 @@ kubernetes:
  $(prefix_cmd) unzip /run/konduktor/s3-secret/awscredentials -d ~/.aws
  {% endif %}
  {% endfor %}
+
+ {% if default_secrets %}
+ $(prefix_cmd) echo "===== KONDUKTOR: Unpacking default secrets ====="
+ $(prefix_cmd) mkdir -p "${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
+
+ # For each mounted default secret folder:
+ # - if payload.zip exists, unzip it into the expanded dir
+ # - otherwise, copy the files as-is
+ for src in "${KONDUKTOR_DEFAULT_SECRETS}"/*; do
+ [ -d "$src" ] || continue
+ name="$(basename "$src")"
+ dst="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}/${name}"
+ $(prefix_cmd) mkdir -p "$dst"
+
+ if [ -f "${src}/payload.zip" ]; then
+ $(prefix_cmd) unzip -oq "${src}/payload.zip" -d "$dst"
+ else
+ $(prefix_cmd) cp -a "${src}/." "$dst/"
+ fi
+ done
+
+ # Point callers to the expanded (writable) path going forward
+ export KONDUKTOR_DEFAULT_SECRETS="${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}"
+ $(prefix_cmd) echo "KONDUKTOR_DEFAULT_SECRETS=${KONDUKTOR_DEFAULT_SECRETS_EXPANDED}" >> /etc/environment
+ {% endif %}
+
  {% if git_ssh %}
  $(prefix_cmd) echo "Unpacking GIT-SSH secret"
  {% endif %}
  end_epoch=$(date +%s);
  $(prefix_cmd) echo "===== KONDUKTOR: Unpacking secrets credentials took $((end_epoch - start_epoch)) seconds ====="

+ $(prefix_cmd) echo "===== KONDUKTOR: Default secret summary ====="
+ start_epoch=$(date +%s)
+
+ root="${KONDUKTOR_DEFAULT_SECRETS:-}"
+ if [[ -z "$root" || ! -d "$root" ]]; then
+ $(prefix_cmd) echo "NO DEFAULT SECRETS FOUND."
+ else
+ for dir in "$root"/*; do
+ [ -d "$dir" ] || continue
+ name="$(basename "$dir")"
+
+ # Pretty header that mirrors the logical mount base:
+ $(prefix_cmd) echo "/konduktor/default-secrets/${name}:"
+
+ # Print relative paths only; skip macOS junk and k8s secret internals
+ (
+ cd "$dir"
+ out="$(find . \
+ \( -name '.DS_Store' -o -name '__MACOSX' -o -name '..data' -o -name '..*' \) -prune -o \
+ \( -type f -o -type l \) -print \
+ | sed 's|^\./||' \
+ | sort)"
+ if [ -n "$out" ]; then
+ printf "%s\n" "$out"
+ fi
+ )
+ done
+ fi
+
+ end_epoch=$(date +%s)
+ $(prefix_cmd) echo "===== KONDUKTOR: Default secret summary took $((end_epoch - start_epoch)) seconds ====="
+
+
  # sync file mounts
  {% for mkdir_command in mkdir_commands %}
  $(prefix_cmd) {{ mkdir_command }}
@@ -436,6 +542,10 @@ kubernetes:
  secret:
  secretName: {{ secret.k8s_name }}
  {% endfor %}
+ {% if default_secrets %}
+ - name: default-secrets-expanded
+ emptyDir: {}
+ {% endif %}
  {% if git_ssh %}
  - name: git-ssh-secret
  secret:
konduktor/utils/base64_utils.py
@@ -44,6 +44,8 @@ def zip_base64encode(files: List[str]) -> str:
  else:
  for root, _, files in os.walk(item_path):
  for file in files:
+ if file == '.DS_Store':
+ continue
  file_path = os.path.join(root, file)
  arcname = os.path.relpath(file_path, temp_dir)
  zipf.write(file_path, arcname)
pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "konduktor-nightly"
- version = "0.1.0.dev20251030104830"
+ version = "0.1.0.dev20251101104430"
  description = "GPU Cluster Health Management"
  packages = [
  {include = "konduktor"}