konduktor-nightly 0.1.0.dev20250825104841__tar.gz → 0.1.0.dev20250827104553__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

Files changed (106)
  1. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/deployment.py +8 -0
  4. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/deployment_utils.py +318 -41
  5. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/jobset.py +3 -2
  6. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/jobset_utils.py +8 -1
  7. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/cli.py +12 -2
  8. konduktor_nightly-0.1.0.dev20250827104553/konduktor/manifests/apoxy-setup.yaml +151 -0
  9. konduktor_nightly-0.1.0.dev20250827104553/konduktor/manifests/apoxy-setup2.yaml +34 -0
  10. konduktor_nightly-0.1.0.dev20250827104553/konduktor/templates/apoxy-deployment.yaml.j2 +33 -0
  11. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/schemas.py +14 -0
  12. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/pyproject.toml +1 -1
  13. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/LICENSE +0 -0
  14. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/README.md +0 -0
  15. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/adaptors/__init__.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/adaptors/aws.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/adaptors/common.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/adaptors/gcp.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/authentication.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/__init__.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/backend.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/constants.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/backends/pod_utils.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/check.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/config.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/constants.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/controller/__init__.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/controller/constants.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/controller/launch.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/controller/node.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/controller/parse.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/README.md +0 -0
  33. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/backend/main.py +0 -0
  34. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/backend/sockets.py +0 -0
  35. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  36. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/.gitignore +0 -0
  37. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  38. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  39. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  47. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  52. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/globals.css +0 -0
  53. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  54. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/layout.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/app/page.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  58. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  59. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/package-lock.json +0 -0
  60. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/package.json +0 -0
  61. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  62. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/server.js +0 -0
  63. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  64. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/__init__.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/aws/__init__.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/aws/s3.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/constants.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/data_utils.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/gcp/__init__.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/gcp/constants.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/gcp/gcs.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/gcp/utils.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/registry.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/storage.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/data/storage_utils.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/execution.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/kube_client.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/logging.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/manifests/controller_deployment.yaml +0 -0
  80. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  83. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/resource.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/serving.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/task.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/templates/deployment.yaml.j2 +0 -0
  87. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/templates/jobset.yaml.j2 +0 -0
  88. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/templates/pod.yaml.j2 +0 -0
  89. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/usage/__init__.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/usage/constants.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/__init__.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/accelerator_registry.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/annotations.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/base64_utils.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/common_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/constants.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/env_options.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/exceptions.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/kubernetes_enums.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/kubernetes_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/log_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/loki_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/rich_utils.py +0 -0
  104. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/subprocess_utils.py +0 -0
  105. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/ux_utils.py +0 -0
  106. {konduktor_nightly-0.1.0.dev20250825104841 → konduktor_nightly-0.1.0.dev20250827104553}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250825104841
3
+ Version: 0.1.0.dev20250827104553
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = 'face26aca22b99192e740dca4875261fcffa2a55'
14
+ _KONDUKTOR_COMMIT_SHA = 'cb72c75ad328b535768794b5979a5ec56edb3d8e'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250825104841'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250827104553'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -163,6 +163,14 @@ class DeploymentBackend(backend.Backend):
163
163
  dryrun=dryrun,
164
164
  )
165
165
 
166
+ # Apoxy resources for general deployments only when endpoint: trainy
167
+ if deployment_utils.get_endpoint_type_from_config() == 'trainy':
168
+ deployment_utils.create_apoxy_resources(
169
+ namespace=namespace,
170
+ task=task,
171
+ dryrun=dryrun,
172
+ )
173
+
166
174
  if not dryrun and not detach_run:
167
175
  with ux_utils.print_exception_no_traceback():
168
176
  with rich_utils.safe_status(
@@ -2,6 +2,7 @@
2
2
 
3
3
  import json
4
4
  import os
5
+ import random
5
6
  import tempfile
6
7
  import typing
7
8
  from typing import Any, Dict, List, Optional, Tuple
@@ -14,6 +15,7 @@ from rich.table import Table
14
15
  from rich.text import Text
15
16
 
16
17
  import konduktor
18
+ from konduktor import config as konduktor_config
17
19
  from konduktor import kube_client, logging
18
20
  from konduktor.backends import constants as backend_constants
19
21
  from konduktor.backends import pod_utils
@@ -48,6 +50,32 @@ _DEPLOYMENT_METADATA_LABELS = {
48
50
  }
49
51
 
50
52
 
53
+ # actually just gets highest existing deployment number and adds 1
54
+ def get_next_deployment_number(cluster_name: str) -> int:
55
+ """Get next number by counting existing Apoxy resources."""
56
+ try:
57
+ context = kubernetes_utils.get_current_kube_config_context_name()
58
+ custom_api = kube_client.crd_api(context=context)
59
+
60
+ # Count existing backends
61
+ backends = custom_api.list_cluster_custom_object(
62
+ group='core.apoxy.dev', version='v1alpha', plural='backends'
63
+ )
64
+
65
+ # Find the highest number
66
+ max_number = 0
67
+ for backend in backends.get('items', []):
68
+ name = backend['metadata']['name']
69
+ if name.startswith(f'{cluster_name}-backend-'):
70
+ number = int(name.split('-')[-1])
71
+ max_number = max(max_number, number)
72
+
73
+ return max_number + 1
74
+ except Exception as e:
75
+ logger.warning(f'Error counting existing resources: {e}')
76
+ return random.randint(100, 999)
77
+
78
+
51
79
  def render_specs(
52
80
  task: 'konduktor.Task',
53
81
  ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
@@ -124,6 +152,130 @@ def render_specs(
124
152
  return deployment_spec, service_spec, autoscaler_spec or {}
125
153
 
126
154
 
155
+ # For general deployments, create resources as needed
156
+ def render_apoxy_spec(task: 'konduktor.Task') -> List[Dict[str, Any]]:
157
+ """Renders the Apoxy specs for a general deployment."""
158
+ general = True
159
+ if task.run and 'vllm.entrypoints.openai.api_server' in task.run:
160
+ general = False
161
+
162
+ if not general:
163
+ return [] # Only render for general deployments
164
+
165
+ if task.run:
166
+ task.run = task.run.replace('__KONDUKTOR_TASK_NAME__', task.name)
167
+
168
+ unique_cluster_name = get_unique_cluster_name_from_tunnel()
169
+ cluster_name = unique_cluster_name[:-3]
170
+ deployment_number = get_next_deployment_number(unique_cluster_name)
171
+
172
+ with tempfile.NamedTemporaryFile() as temp:
173
+ common_utils.fill_template(
174
+ 'apoxy-deployment.yaml.j2',
175
+ {
176
+ 'name': task.name,
177
+ 'user': common_utils.get_cleaned_username(),
178
+ 'ports': task.serving.ports if task.serving else 8000,
179
+ 'general': general,
180
+ 'cluster_name': cluster_name,
181
+ 'unique_cluster_name': unique_cluster_name,
182
+ 'deployment_number': deployment_number,
183
+ **_DEPLOYMENT_METADATA_LABELS,
184
+ },
185
+ temp.name,
186
+ )
187
+ docs = common_utils.read_yaml_all(temp.name)
188
+ return docs
189
+
190
+
191
+ def create_apoxy_resources(
192
+ namespace: str,
193
+ task: 'konduktor.Task',
194
+ dryrun: bool = False,
195
+ ) -> None:
196
+ """Creates Apoxy resources for a general deployment."""
197
+
198
+ apoxy_specs = render_apoxy_spec(task)
199
+
200
+ if not apoxy_specs:
201
+ return
202
+
203
+ if dryrun:
204
+ logger.debug(f'[DRYRUN] Would create Apoxy resources:\n{apoxy_specs}')
205
+ return
206
+
207
+ try:
208
+ context = kubernetes_utils.get_current_kube_config_context_name()
209
+ custom_api = kube_client.crd_api(context=context)
210
+
211
+ for spec in apoxy_specs:
212
+ kind = spec.get('kind')
213
+ name = spec['metadata']['name']
214
+
215
+ try:
216
+ if kind == 'Backend':
217
+ custom_api.create_cluster_custom_object(
218
+ group='core.apoxy.dev',
219
+ version='v1alpha',
220
+ plural='backends',
221
+ body=spec,
222
+ )
223
+ logger.info(f'Apoxy Backend {name} created')
224
+ elif kind == 'HTTPRoute':
225
+ custom_api.create_cluster_custom_object(
226
+ group='gateway.apoxy.dev',
227
+ version='v1',
228
+ plural='httproutes',
229
+ body=spec,
230
+ )
231
+ logger.info(f'Apoxy HTTPRoute {name} created')
232
+ except Exception as e:
233
+ if '409' in str(e) or 'AlreadyExists' in str(e):
234
+ try:
235
+ # Delete first, then create
236
+ if kind == 'Backend':
237
+ custom_api.delete_cluster_custom_object(
238
+ group='core.apoxy.dev',
239
+ version='v1alpha',
240
+ plural='backends',
241
+ name=name,
242
+ )
243
+ custom_api.create_cluster_custom_object(
244
+ group='core.apoxy.dev',
245
+ version='v1alpha',
246
+ plural='backends',
247
+ body=spec,
248
+ )
249
+ elif kind == 'HTTPRoute':
250
+ custom_api.delete_cluster_custom_object(
251
+ group='gateway.apoxy.dev',
252
+ version='v1',
253
+ plural='httproutes',
254
+ name=name,
255
+ )
256
+ custom_api.create_cluster_custom_object(
257
+ group='gateway.apoxy.dev',
258
+ version='v1',
259
+ plural='httproutes',
260
+ body=spec,
261
+ )
262
+ logger.info(f'Apoxy {kind} {name} deleted and recreated')
263
+ except Exception as delete_create_error:
264
+ logger.error(
265
+ f'Failed to delete and recreate {kind} {name}: '
266
+ f'{delete_create_error}'
267
+ )
268
+ raise
269
+ elif '404' in str(e) or 'NotFound' in str(e):
270
+ logger.warning(f'Apoxy CRD for {kind} not found. Skipping {name}.')
271
+ logger.info('Make sure Apoxy is deployed and CRDs are ready.')
272
+ continue
273
+ else:
274
+ raise
275
+ except Exception as e:
276
+ logger.error(f'Error creating Apoxy resources: {e}')
277
+
278
+
127
279
  def create_deployment(
128
280
  namespace: str,
129
281
  task: 'konduktor.Task',
@@ -576,7 +728,7 @@ def delete_serving_specs(name: str, namespace: str) -> None:
576
728
  delete_fn(namespace, name)
577
729
  logger.info(f'Deleted {kind}: {name}')
578
730
  except Exception as e:
579
- logger.error(f'Failed to delete {kind} {name}: {e}')
731
+ logger.debug(f'Failed to delete {kind} {name}: {e}')
580
732
 
581
733
 
582
734
  def _get_resource_summary(deployment) -> str:
@@ -627,7 +779,156 @@ def get_envoy_external_ip() -> Optional[str]:
627
779
  return None
628
780
 
629
781
 
630
- def show_status_table(namespace: str, all_users: bool):
782
+ def get_unique_cluster_name_from_tunnel() -> str:
783
+ """Get cluster name from the apoxy deployment command."""
784
+ try:
785
+ context = kubernetes_utils.get_current_kube_config_context_name()
786
+ apps_api = kube_client.apps_api(context=context)
787
+
788
+ # Get the apoxy deployment
789
+ deployment = apps_api.read_namespaced_deployment(
790
+ name='apoxy', namespace='default'
791
+ )
792
+
793
+ # Extract cluster name from the command
794
+ containers = deployment.spec.template.spec.containers
795
+ if containers and len(containers) > 0:
796
+ command = containers[0].command
797
+ if (
798
+ command
799
+ and len(command) >= 4
800
+ and command[1] == 'tunnel'
801
+ and command[2] == 'run'
802
+ ):
803
+ return command[3] # The cluster name is the 4th argument
804
+
805
+ logger.warning('Could not extract cluster name from apoxy deployment command')
806
+
807
+ except Exception as e:
808
+ logger.warning(f'Error getting cluster name from apoxy deployment: {e}')
809
+
810
+ return 'default'
811
+
812
+
813
+ def get_endpoint_type_from_config() -> str:
814
+ """Get the endpoint type from konduktor config.
815
+
816
+ Returns:
817
+ 'trainy' for Apoxy endpoints (default)
818
+ 'direct' for LoadBalancer IP endpoints
819
+ """
820
+ try:
821
+ # Use the proper config system that handles KONDUKTOR_CONFIG env var
822
+ endpoint_type = konduktor_config.get_nested(('serving', 'endpoint'), 'trainy')
823
+ return endpoint_type.lower()
824
+ except Exception as e:
825
+ logger.warning(f'Error reading endpoint config: {e}')
826
+
827
+ # Default to trainy if config not found or error
828
+ return 'trainy'
829
+
830
+
831
+ def _get_loadbalancer_endpoint_with_port(service_name: str) -> str:
832
+ """Helper function to get LoadBalancer endpoint with port."""
833
+ try:
834
+ context = kubernetes_utils.get_current_kube_config_context_name()
835
+ core_api = kube_client.core_api(context=context)
836
+
837
+ # Get the service
838
+ service = core_api.read_namespaced_service(
839
+ name=service_name, namespace='default'
840
+ )
841
+
842
+ # Check if it's LoadBalancer type
843
+ if service.spec.type == 'LoadBalancer':
844
+ ingress = service.status.load_balancer.ingress
845
+ if ingress and len(ingress) > 0:
846
+ ip = ingress[0].ip
847
+ if ip:
848
+ return f'{ip}:{service.spec.ports[0].port}'
849
+
850
+ # If not LoadBalancer or no IP, return pending
851
+ return '<pending>'
852
+
853
+ except Exception:
854
+ return '<pending>'
855
+
856
+
857
+ def get_vllm_deployment_endpoint(force_direct: bool = False) -> str:
858
+ """Get the endpoint for vLLM/Aibrix deployments based on config."""
859
+ if force_direct:
860
+ # Force direct endpoint display regardless of config
861
+ endpoint_type = 'direct'
862
+ else:
863
+ endpoint_type = get_endpoint_type_from_config()
864
+
865
+ if endpoint_type == 'direct':
866
+ try:
867
+ aibrix_endpoint = get_envoy_external_ip()
868
+ return aibrix_endpoint or '<pending>'
869
+ except Exception:
870
+ return '<pending>'
871
+ else:
872
+ try:
873
+ cluster_name = get_unique_cluster_name_from_tunnel()
874
+ return f'{cluster_name[:-3]}.trainy.us'
875
+ except Exception:
876
+ # Fallback to direct endpoint if trainy.us not available
877
+ try:
878
+ aibrix_endpoint = get_envoy_external_ip()
879
+ if aibrix_endpoint:
880
+ # Aibrix deployments route through Envoy Gateway on port 80
881
+ return f'{aibrix_endpoint}'
882
+ except Exception:
883
+ pass
884
+ return '<pending>'
885
+
886
+
887
+ def get_general_deployment_endpoint(
888
+ service_name: str, force_direct: bool = False
889
+ ) -> str:
890
+ """Get the endpoint for a general deployment based on config."""
891
+ if force_direct:
892
+ # Force direct endpoint display regardless of config
893
+ endpoint_type = 'direct'
894
+ else:
895
+ endpoint_type = get_endpoint_type_from_config()
896
+
897
+ if endpoint_type == 'direct':
898
+ # Use LoadBalancer IP with port
899
+ return _get_loadbalancer_endpoint_with_port(service_name)
900
+ else:
901
+ # Use Apoxy (trainy.us) - existing logic
902
+ try:
903
+ context = kubernetes_utils.get_current_kube_config_context_name()
904
+ custom_api = kube_client.crd_api(context=context)
905
+
906
+ # Query route with label selector using the original task name
907
+ routes = custom_api.list_cluster_custom_object(
908
+ group='gateway.apoxy.dev',
909
+ version='v1',
910
+ plural='httproutes',
911
+ label_selector=f'task_name={service_name}',
912
+ )
913
+
914
+ # Extract endpoint_name from the route labels
915
+ if routes.get('items') and len(routes['items']) > 0:
916
+ route = routes['items'][0] # Should only be one route with this label
917
+ labels = route.get('metadata', {}).get('labels', {})
918
+ endpoint_name = labels.get('endpoint_name')
919
+ if endpoint_name:
920
+ return endpoint_name
921
+
922
+ # Fallback if no route found - try direct LoadBalancer endpoint
923
+ return _get_loadbalancer_endpoint_with_port(service_name)
924
+
925
+ except Exception as e:
926
+ logger.warning(f'Endpoint error for general deployment {service_name}: {e}')
927
+ # Fallback to direct LoadBalancer endpoint on error
928
+ return _get_loadbalancer_endpoint_with_port(service_name)
929
+
930
+
931
+ def show_status_table(namespace: str, all_users: bool, force_direct: bool = False):
631
932
  """Display status of Konduktor Serve models."""
632
933
  context = kubernetes_utils.get_current_kube_config_context_name()
633
934
 
@@ -657,10 +958,12 @@ def show_status_table(namespace: str, all_users: bool):
657
958
  return
658
959
 
659
960
  Console().print()
660
- external_ip = get_envoy_external_ip()
661
961
  title = '[bold]KONDUKTOR SERVE[/bold]'
662
962
  is_ci = os.environ.get('CI') or os.environ.get('BUILDKITE')
663
963
 
964
+ # Get Aibrix endpoint once for all Aibrix deployments
965
+ aibrix_endpoint = get_vllm_deployment_endpoint(force_direct)
966
+
664
967
  table = Table(title=title, box=box.ASCII if is_ci else box.ROUNDED)
665
968
  if all_users:
666
969
  table.add_column('User', style='magenta', no_wrap=True)
@@ -743,48 +1046,22 @@ def show_status_table(namespace: str, all_users: bool):
743
1046
  else Text('PENDING', style='yellow')
744
1047
  )
745
1048
 
746
- # Type & endpoint
747
- ip_str = '<pending>'
748
- labels = (
749
- (
750
- deployment.metadata.labels
751
- if deployment and hasattr(deployment.metadata, 'labels')
752
- else {}
753
- )
754
- or (
755
- service.metadata.labels
756
- if service and hasattr(service.metadata, 'labels')
757
- else {}
758
- )
759
- or {}
760
- )
761
- if AIBRIX_NAME_LABEL in labels:
762
- ip_str = external_ip or '<pending>'
1049
+ # Extract labels from deployment, service, or fallback to empty dict
1050
+ labels = {}
1051
+ if deployment and hasattr(deployment.metadata, 'labels'):
1052
+ labels = deployment.metadata.labels or {}
1053
+ elif service and hasattr(service.metadata, 'labels'):
1054
+ labels = service.metadata.labels or {}
763
1055
  else:
764
- if (
765
- service
766
- and service.status
767
- and service.status.load_balancer
768
- and service.status.load_balancer.ingress
769
- ):
770
- ing = service.status.load_balancer.ingress[0]
771
- ip_str = ing.ip or ing.hostname or '<pending>'
772
-
773
- # Port
774
- port_str = ''
775
- if service and service.spec and service.spec.ports:
776
- port_obj = (
777
- next((p for p in service.spec.ports if p.name == 'serve'), None)
778
- or service.spec.ports[0]
779
- )
780
- if port_obj and port_obj.port:
781
- port_str = str(port_obj.port)
1056
+ labels = {}
782
1057
 
783
- # For vLLM deployments, don't append port since external routing is on port 80
1058
+ endpoint_str = '<pending>'
784
1059
  if AIBRIX_NAME_LABEL in labels:
785
- endpoint_str = ip_str
1060
+ # Aibrix deployment - use the pre-computed endpoint
1061
+ endpoint_str = aibrix_endpoint
786
1062
  else:
787
- endpoint_str = f'{ip_str}:{port_str}' if port_str else ip_str
1063
+ # General deployment
1064
+ endpoint_str = get_general_deployment_endpoint(name, force_direct)
788
1065
 
789
1066
  # Replicas
790
1067
  ready_replicas = (
@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
176
176
  context = kubernetes_utils.get_current_kube_config_context_name()
177
177
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
178
178
  # TODO(asaiacai): need to set env variables in pod
179
- jobset_utils.create_jobset(
179
+ jobset_response = jobset_utils.create_jobset(
180
180
  namespace,
181
181
  task,
182
182
  pod_spec['kubernetes']['pod_config'],
@@ -192,9 +192,10 @@ class JobsetBackend(backend.Backend):
192
192
  ):
193
193
  _wait_for_jobset_start(namespace, task.name)
194
194
  try:
195
+ assert jobset_response is not None
195
196
  log_thread = threading.Thread(
196
197
  target=log_utils.tail_logs,
197
- args=(task.name,),
198
+ args=(jobset_response,),
198
199
  daemon=True,
199
200
  )
200
201
  logger.info('streaming logs...')
@@ -575,8 +575,15 @@ def show_status_table(
575
575
  ]['containers'][0]['resources']['limits'] # noqa: E501
576
576
  cpu, memory = resources['cpu'], resources['memory']
577
577
  accelerator = job['metadata']['labels'].get(JOBSET_ACCELERATOR_LABEL, None)
578
+ num_accelerators = job['metadata']['labels'].get(
579
+ JOBSET_NUM_ACCELERATORS_LABEL, None
580
+ )
578
581
  if accelerator:
579
- return f'{num_pods}x({cpu}CPU, {memory}MEM, {accelerator})'
582
+ if num_accelerators:
583
+ accelerator_with_count = f'{accelerator}:{num_accelerators}'
584
+ else:
585
+ accelerator_with_count = accelerator
586
+ return f'{num_pods}x({cpu}CPU, {memory}MEM, {accelerator_with_count})'
580
587
  else:
581
588
  return f'{num_pods}x({cpu}CPU, {memory}MEM)'
582
589
 
@@ -1852,11 +1852,21 @@ def serve_down(
1852
1852
  required=False,
1853
1853
  help='Show all deployments, including those not owned by the ' 'current user.',
1854
1854
  )
1855
- def serve_status(all_users: bool):
1855
+ @click.option(
1856
+ '--direct',
1857
+ '-d',
1858
+ default=False,
1859
+ is_flag=True,
1860
+ required=False,
1861
+ help='Force display of direct IP endpoints instead of trainy.us endpoints.',
1862
+ )
1863
+ def serve_status(all_users: bool, direct: bool):
1856
1864
  """Show status of deployments launched via `konduktor serve launch`."""
1857
1865
  context = kubernetes_utils.get_current_kube_config_context_name()
1858
1866
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
1859
- deployment_utils.show_status_table(namespace, all_users=all_users)
1867
+ deployment_utils.show_status_table(
1868
+ namespace, all_users=all_users, force_direct=direct
1869
+ )
1860
1870
 
1861
1871
 
1862
1872
  def main():
@@ -0,0 +1,151 @@
1
+ apiVersion: v1
2
+ kind: Secret
3
+ metadata:
4
+ name: trainy-kubeconfig
5
+ namespace: default
6
+ type: Opaque
7
+ data:
8
+ # this gets replaced by buildkite CI secret APOXY_AUTH
9
+ kubeconfig.yaml: |
10
+ APOXY_AUTH
11
+ ---
12
+ apiVersion: v1
13
+ kind: ServiceAccount
14
+ metadata:
15
+ name: kube-controller
16
+ namespace: default
17
+ ---
18
+ apiVersion: rbac.authorization.k8s.io/v1
19
+ kind: ClusterRole
20
+ metadata:
21
+ name: kube-controller-role
22
+ rules:
23
+ - apiGroups: ["apiregistration.k8s.io"]
24
+ resources: ["apiservices"]
25
+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
26
+ ---
27
+ apiVersion: rbac.authorization.k8s.io/v1
28
+ kind: ClusterRoleBinding
29
+ metadata:
30
+ name: kube-controller-role-binding
31
+ roleRef:
32
+ apiGroup: rbac.authorization.k8s.io
33
+ kind: ClusterRole
34
+ name: kube-controller-role
35
+ subjects:
36
+ - kind: ServiceAccount
37
+ name: kube-controller
38
+ namespace: default
39
+ ---
40
+ apiVersion: apps/v1
41
+ kind: Deployment
42
+ metadata:
43
+ name: kube-controller
44
+ namespace: default
45
+ labels:
46
+ app: kube-controller
47
+ spec:
48
+ replicas: 1
49
+ selector:
50
+ matchLabels:
51
+ app: kube-controller
52
+ template:
53
+ metadata:
54
+ labels:
55
+ app: kube-controller
56
+ spec:
57
+ containers:
58
+ - name: kube-controller
59
+ image: apoxy/kube-controller:v0.11.6
60
+ args:
61
+ - --dev
62
+ - --project_id=7ce458d7-e20c-443c-aeeb-dbc5663c1240
63
+ - --kubeconfig_path=/data/kubeconfig.yaml
64
+ env:
65
+ - name: POD_NAMESPACE
66
+ valueFrom:
67
+ fieldRef:
68
+ fieldPath: metadata.namespace
69
+ volumeMounts:
70
+ - name: kubeconfig-volume
71
+ mountPath: /data
72
+ readOnly: true
73
+ volumes:
74
+ - name: kubeconfig-volume
75
+ secret:
76
+ secretName: trainy-kubeconfig
77
+ items:
78
+ - key: kubeconfig.yaml
79
+ path: kubeconfig.yaml
80
+ mode: 0600
81
+ serviceAccountName: kube-controller
82
+
83
+ ---
84
+ apiVersion: v1
85
+ kind: Service
86
+ metadata:
87
+ name: kube-controller
88
+ namespace: default
89
+ labels:
90
+ app: kube-controller
91
+ spec:
92
+ selector:
93
+ app: kube-controller
94
+ ports:
95
+ - name: http
96
+ protocol: TCP
97
+ port: 8443
98
+ targetPort: 8443
99
+ ---
100
+ apiVersion: v1
101
+ kind: ConfigMap
102
+ metadata:
103
+ name: apoxy-config
104
+ namespace: default
105
+ data:
106
+ config.yaml: |
107
+ apiVersion: config.apoxy.dev/v1alpha1
108
+ kind: Config
109
+ currentProject: 7ce458d7-e20c-443c-aeeb-dbc5663c1240
110
+ projects:
111
+ - id: 7ce458d7-e20c-443c-aeeb-dbc5663c1240
112
+ kubernetesConfig:
113
+ kubeconfigPath: /root/kubeconfig.yaml
114
+ tunnel:
115
+ mode: userspace
116
+ ---
117
+ apiVersion: apps/v1
118
+ kind: Deployment
119
+ metadata:
120
+ name: apoxy
121
+ namespace: default
122
+ labels:
123
+ app: apoxy
124
+ spec:
125
+ replicas: 1
126
+ selector:
127
+ matchLabels:
128
+ app: apoxy
129
+ template:
130
+ metadata:
131
+ labels:
132
+ app: apoxy
133
+ spec:
134
+ containers:
135
+ - name: apoxy
136
+ image: apoxy/apoxy:v0.11.10
137
+ command: ["apoxy", "tunnel", "run", "UNIQUE-TEMPNAME", "--insecure-skip-verify"]
138
+ volumeMounts:
139
+ - name: kubeconfig-volume
140
+ mountPath: /root/kubeconfig.yaml
141
+ subPath: kubeconfig.yaml
142
+ - name: apoxy-config-volume
143
+ mountPath: /root/.apoxy/config.yaml
144
+ subPath: config.yaml
145
+ volumes:
146
+ - name: kubeconfig-volume
147
+ secret:
148
+ secretName: trainy-kubeconfig
149
+ - name: apoxy-config-volume
150
+ configMap:
151
+ name: apoxy-config
@@ -0,0 +1,34 @@
1
+ apiVersion: core.apoxy.dev/v1alpha
2
+ kind: TunnelNode
3
+ metadata:
4
+ name: UNIQUE-TEMPNAME
5
+ spec:
6
+ egressGateway:
7
+ enabled: true
8
+ ---
9
+ # Add just your backend for aibrix
10
+ apiVersion: core.apoxy.dev/v1alpha
11
+ kind: Backend
12
+ metadata:
13
+ name: UNIQUE-TEMPNAME-backend
14
+ spec:
15
+ endpoints:
16
+ - fqdn: envoy-aibrix-system-aibrix-eg-903790dc.envoy-gateway-system.UNIQUE-TEMPNAME.tun.apoxy.net
17
+ ---
18
+ # Add just your route for aibrix
19
+ apiVersion: gateway.apoxy.dev/v1
20
+ kind: HTTPRoute
21
+ metadata:
22
+ name: UNIQUE-TEMPNAME-route
23
+ spec:
24
+ parentRefs:
25
+ - name: default
26
+ kind: Gateway
27
+ port: 443
28
+ hostnames:
29
+ - 'TEMPNAME.trainy.us'
30
+ rules:
31
+ - backendRefs:
32
+ - kind: Backend
33
+ name: UNIQUE-TEMPNAME-backend
34
+ port: 80
@@ -0,0 +1,33 @@
1
+ ---
2
+ # Apoxy Backend for general deployment
3
+ apiVersion: core.apoxy.dev/v1alpha
4
+ kind: Backend
5
+ metadata:
6
+ name: {{ unique_cluster_name }}-backend-{{ deployment_number }}
7
+ labels:
8
+ task_name: {{ name }}
9
+ endpoint_name: {{ cluster_name }}-{{ deployment_number }}.trainy.us
10
+ spec:
11
+ endpoints:
12
+ - fqdn: {{ name }}.default.{{ unique_cluster_name }}.tun.apoxy.net
13
+ ---
14
+ # Apoxy Route for general deployment
15
+ apiVersion: gateway.apoxy.dev/v1
16
+ kind: HTTPRoute
17
+ metadata:
18
+ name: {{ unique_cluster_name }}-route-{{ deployment_number }}
19
+ labels:
20
+ task_name: {{ name }}
21
+ endpoint_name: {{ cluster_name }}-{{ deployment_number }}.trainy.us
22
+ spec:
23
+ parentRefs:
24
+ - name: default
25
+ kind: Gateway
26
+ port: 443
27
+ hostnames:
28
+ - '{{ cluster_name }}-{{ deployment_number }}.trainy.us'
29
+ rules:
30
+ - backendRefs:
31
+ - kind: Backend
32
+ name: {{ unique_cluster_name }}-backend-{{ deployment_number }}
33
+ port: {{ ports }}
@@ -578,6 +578,19 @@ def get_config_schema():
578
578
  },
579
579
  }
580
580
 
581
+ serving_configs = {
582
+ 'type': 'object',
583
+ 'required': [],
584
+ 'additionalProperties': False,
585
+ 'properties': {
586
+ 'endpoint': {
587
+ 'type': 'string',
588
+ 'case_insensitive_enum': ['trainy', 'direct'],
589
+ 'default': 'trainy',
590
+ },
591
+ },
592
+ }
593
+
581
594
  for cloud, config in cloud_configs.items():
582
595
  if cloud == 'kubernetes':
583
596
  config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES)
@@ -595,6 +608,7 @@ def get_config_schema():
595
608
  'logs': logs_configs,
596
609
  'tailscale': tailscale_configs,
597
610
  'ssh': ssh_configs,
611
+ 'serving': serving_configs,
598
612
  **cloud_configs,
599
613
  },
600
614
  }
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250825104841"
3
+ version = "0.1.0.dev20250827104553"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}