konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/logging.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Logging utilities."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import threading
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
|
|
9
|
+
import colorama
|
|
10
|
+
|
|
11
|
+
from konduktor import constants
|
|
12
|
+
|
|
13
|
+
CHECK_MARK_EMOJI = '\U00002714' # Heavy check mark unicode
|
|
14
|
+
PARTY_POPPER_EMOJI = '\U0001f389' # Party popper unicode
|
|
15
|
+
|
|
16
|
+
_FORMAT = '[%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
|
|
17
|
+
_DATE_FORMAT = '%m-%d %H:%M:%S'
|
|
18
|
+
|
|
19
|
+
_logging_config = threading.local()
|
|
20
|
+
_log_path = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class NewLineFormatter(logging.Formatter):
    """Formatter that repeats the record prefix on each continuation line.

    The text that precedes the message in the formatted output is inserted
    after every newline (which is emitted as ``\\r\\n``), so multi-line
    messages stay aligned under the same prefix.
    """

    def __init__(self, fmt, datefmt=None, dim=False):
        """Store the format strings and whether output should be dimmed.

        Args:
            fmt: Format string handed to ``logging.Formatter``.
            datefmt: Optional date format handed to ``logging.Formatter``.
            dim: When true, wrap the formatted text in colorama's DIM style.
        """
        super().__init__(fmt, datefmt)
        self.dim = dim

    def format(self, record):
        """Return the formatted record with the prefix applied to each line."""
        rendered = super().format(record)
        if record.message != '':
            # Everything before the first occurrence of the message text is
            # treated as the prefix to repeat.
            prefix, _, _ = rendered.partition(record.message)
            rendered = rendered.replace('\n', '\r\n' + prefix)
        if self.dim:
            rendered = ''.join(
                (colorama.Style.DIM, rendered, colorama.Style.RESET_ALL))
        return rendered
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
FORMATTER = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@contextlib.contextmanager
def set_logging_level(logger: str, level: int):
    """Temporarily set the level of the named logger.

    Args:
        logger: Name of the logger to adjust (looked up with
            ``logging.getLogger``).
        level: Level to apply for the duration of the ``with`` block.

    The logger's previous level is restored on exit, including when the
    body raises.
    """
    target = logging.getLogger(logger)
    previous_level = target.level
    target.setLevel(level)
    try:
        yield
    finally:
        target.setLevel(previous_level)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_logger(name: str):
    """Return the logger called ``name``, configuring it on first use.

    If ``logger.hasHandlers()`` is already true the logger is returned
    untouched. Otherwise the logger is set to DEBUG and given two handlers:

    * a file handler (DEBUG) writing to a per-process log file under
      ``~/.konduktor/logs``; the path is chosen, and printed, the first time
      any logger is configured and then reused through ``_log_path``;
    * a console handler at INFO, or DEBUG when ``KONDUKTOR_DEBUG`` is ``'1'``.

    Propagation to ancestor loggers is disabled for loggers configured here.

    Args:
        name: Logger name, passed to ``logging.getLogger``.

    Returns:
        The ``logging.Logger`` instance for ``name``.
    """
    global _log_path

    logger = logging.getLogger(name)

    # Avoid duplicate handlers
    if logger.hasHandlers():
        return logger

    logger.setLevel(logging.DEBUG)  # Always capture all levels internally

    # --- File logging: Always enabled ---
    if not _log_path:
        log_dir = os.path.expanduser('~/.konduktor/logs')
        os.makedirs(log_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
        _log_path = os.path.join(log_dir, f'konduktor-logs-{timestamp}.log')
        print(f'Log file: {_log_path}')

    # Write the file as UTF-8 explicitly: this module defines emoji constants
    # for log output, and the locale default encoding (e.g. cp1252) cannot
    # encode them, which would make the handler fail on those records.
    fh = logging.FileHandler(_log_path, encoding='utf-8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(FORMATTER)
    logger.addHandler(fh)

    # --- Console logging: INFO level by default, DEBUG if KONDUKTOR_DEBUG=1 ---
    ch = logging.StreamHandler()
    if os.environ.get('KONDUKTOR_DEBUG') == '1':
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.INFO)
    ch.setFormatter(FORMATTER)
    logger.addHandler(ch)

    logger.propagate = False
    return logger
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def is_silent():
    """Return the calling thread's ``is_silent`` flag, defaulting to False.

    The default is assigned lazily here rather than at import time: a
    module-level assignment on the thread-local object would only run in the
    thread that imports the module, leaving other threads without the
    attribute.
    """
    try:
        return _logging_config.is_silent
    except AttributeError:
        _logging_config.is_silent = False
        return _logging_config.is_silent
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def get_run_timestamp() -> str:
    """Return ``'konduktor-'`` followed by the current local time.

    The time is rendered as year-month-day-hour-minute-second-microsecond,
    with the fields joined by dashes.
    """
    stamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    return f'konduktor-{stamp}'
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def generate_tmp_logging_file_path(file_name: str) -> str:
    """Build a log file path inside a fresh per-run directory.

    The path is ``<KONDUKTOR_LOGS_DIRECTORY>/<run timestamp>/<file_name>``
    with a leading ``~`` expanded. Nothing is created on disk.

    Args:
        file_name: Name of the log file to place in the run directory.

    Returns:
        The user-expanded path.
        NOTE(review): the result is absolute only if
        ``constants.KONDUKTOR_LOGS_DIRECTORY`` is absolute or starts with
        ``~`` - confirm against the constants module.
    """
    unexpanded = os.path.join(
        constants.KONDUKTOR_LOGS_DIRECTORY,
        get_run_timestamp(),
        file_name,
    )
    return os.path.expanduser(unexpanded)
|
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
# Aibrix Setup - vLLM Deployment Infrastructure
|
|
2
|
+
#
|
|
3
|
+
# This file sets up the infrastructure needed for vLLM (Aibrix) deployments:
|
|
4
|
+
# 1. Envoy Gateway configuration for HTTP routing
|
|
5
|
+
# 2. Aibrix Activator service for request-based autoscaling (KPA)
|
|
6
|
+
# 3. HTTP route mirroring for prewarming vLLM models
|
|
7
|
+
# 4. Lua script for extracting model names from OpenAI-compatible requests
|
|
8
|
+
#
|
|
9
|
+
# The activator tracks incoming requests and provides metrics to scale
|
|
10
|
+
# vLLM deployments based on demand (requests per second).
|
|
11
|
+
|
|
12
|
+
# This file is kept separate from apoxy setup files because it is
|
|
13
|
+
# only used in actual clusters, not in the test kind clusters.
|
|
14
|
+
|
|
15
|
+
apiVersion: v1
|
|
16
|
+
kind: ConfigMap
|
|
17
|
+
metadata:
|
|
18
|
+
name: envoy-gateway-config
|
|
19
|
+
namespace: envoy-gateway-system
|
|
20
|
+
data:
|
|
21
|
+
envoy-gateway.yaml: |
|
|
22
|
+
apiVersion: gateway.envoyproxy.io/v1alpha1
|
|
23
|
+
kind: EnvoyGateway
|
|
24
|
+
provider:
|
|
25
|
+
type: Kubernetes
|
|
26
|
+
gateway:
|
|
27
|
+
controllerName: gateway.envoyproxy.io/gatewayclass-controller
|
|
28
|
+
extensionApis:
|
|
29
|
+
enableEnvoyPatchPolicy: true
|
|
30
|
+
---
|
|
31
|
+
apiVersion: v1
|
|
32
|
+
kind: Namespace
|
|
33
|
+
metadata:
|
|
34
|
+
name: aibrix-activator
|
|
35
|
+
---
|
|
36
|
+
apiVersion: v1
|
|
37
|
+
kind: ServiceAccount
|
|
38
|
+
metadata:
|
|
39
|
+
name: aibrix-activator
|
|
40
|
+
namespace: aibrix-activator
|
|
41
|
+
---
|
|
42
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
43
|
+
kind: ClusterRole
|
|
44
|
+
metadata:
|
|
45
|
+
name: aibrix-activator
|
|
46
|
+
rules:
|
|
47
|
+
- apiGroups: ["apps"]
|
|
48
|
+
resources: ["deployments"]
|
|
49
|
+
verbs: ["get", "list", "watch"]
|
|
50
|
+
---
|
|
51
|
+
apiVersion: rbac.authorization.k8s.io/v1
|
|
52
|
+
kind: ClusterRoleBinding
|
|
53
|
+
metadata:
|
|
54
|
+
name: aibrix-activator
|
|
55
|
+
roleRef:
|
|
56
|
+
apiGroup: rbac.authorization.k8s.io
|
|
57
|
+
kind: ClusterRole
|
|
58
|
+
name: aibrix-activator
|
|
59
|
+
subjects:
|
|
60
|
+
- kind: ServiceAccount
|
|
61
|
+
name: aibrix-activator
|
|
62
|
+
namespace: aibrix-activator
|
|
63
|
+
---
|
|
64
|
+
apiVersion: v1
|
|
65
|
+
kind: ConfigMap
|
|
66
|
+
metadata:
|
|
67
|
+
name: activator-code
|
|
68
|
+
namespace: aibrix-activator
|
|
69
|
+
data:
|
|
70
|
+
activator.py: |
|
|
71
|
+
import os, time, json
|
|
72
|
+
from collections import defaultdict, deque
|
|
73
|
+
from fastapi import FastAPI, Request
|
|
74
|
+
from fastapi.responses import PlainTextResponse, JSONResponse
|
|
75
|
+
import asyncio
|
|
76
|
+
from kubernetes import client, config
|
|
77
|
+
|
|
78
|
+
NAMESPACE = os.getenv("NAMESPACE", "default")
|
|
79
|
+
WINDOW_SEC = int(os.getenv("WINDOW_SEC", "30")) # demand lookback
|
|
80
|
+
CAPACITY_RPS = float(os.getenv("CAPACITY_RPS", "1.0")) # per-replica capacity
|
|
81
|
+
MIN_WAKE = int(os.getenv("MIN_REPLICA_ON_WAKE", "1"))
|
|
82
|
+
MAX_REPLICAS = int(os.getenv("MAX_REPLICAS", "8"))
|
|
83
|
+
CLEANUP_INTERVAL = int(os.getenv("CLEANUP_INTERVAL", "300")) # 5 minutes
|
|
84
|
+
|
|
85
|
+
app = FastAPI()
|
|
86
|
+
events = defaultdict(deque) # key=(ns,model) -> deque[timestamps]
|
|
87
|
+
|
|
88
|
+
# Initialize Kubernetes client (best effort). When in-cluster configuration
# cannot be loaded, k8s_apps_v1 is left as None and the code paths that need
# the API check for that before using it.
try:
    config.load_incluster_config()
    k8s_apps_v1 = client.AppsV1Api()
except Exception as e:
    # `except Exception` rather than a bare `except`, so SystemExit and
    # KeyboardInterrupt are not swallowed; report the reason instead of
    # failing silently.
    print(f"Kubernetes client unavailable: {e}")
    k8s_apps_v1 = None
|
|
94
|
+
|
|
95
|
+
def _prune(q, now):
    """Pop timestamps off the left of ``q`` that are older than the window.

    An entry is dropped while ``now - entry`` exceeds ``WINDOW_SEC``; pruning
    stops at the first entry that is still inside the window.
    """
    while q:
        if not (now - q[0] > WINDOW_SEC):
            break
        q.popleft()
|
|
97
|
+
|
|
98
|
+
def _bump(ns, model):
    """Record one request for ``(ns, model)`` at the current time.

    Appends the timestamp to that key's deque (created on first use) and
    then prunes entries that have fallen out of the lookback window.
    """
    stamp = time.time()
    bucket = events[(ns, model)]
    bucket.append(stamp)
    _prune(bucket, stamp)
|
|
103
|
+
|
|
104
|
+
def _desired(ns, model):
    """Return the suggested replica count for ``(ns, model)``.

    Returns 0 when no requests remain in the lookback window. Otherwise the
    observed rate (requests / WINDOW_SEC) is divided by CAPACITY_RPS and
    rounded up, raised to at least MIN_WAKE, and capped at MAX_REPLICAS.
    """
    import math

    current = time.time()
    bucket = events[(ns, model)]
    _prune(bucket, current)
    pending = len(bucket)
    if pending == 0:
        return 0

    # Convert demand to desired replicas
    rate = pending / max(WINDOW_SEC, 1)
    wanted = math.ceil(rate / max(CAPACITY_RPS, 1e-6))
    wanted = max(MIN_WAKE, wanted)
    return max(0, min(wanted, MAX_REPLICAS))
|
|
114
|
+
|
|
115
|
+
def _extract_model(headers, body_bytes):
|
|
116
|
+
# Prefer header (OpenAI-compatible)
|
|
117
|
+
m = headers.get("model") or headers.get("x-model")
|
|
118
|
+
if m: return m
|
|
119
|
+
# Try JSON body
|
|
120
|
+
try:
|
|
121
|
+
j = json.loads(body_bytes or b"{}")
|
|
122
|
+
if isinstance(j, dict):
|
|
123
|
+
# OpenAI schema: {"model": "...", ...}
|
|
124
|
+
if "model" in j and isinstance(j["model"], str):
|
|
125
|
+
return j["model"]
|
|
126
|
+
except Exception:
|
|
127
|
+
pass
|
|
128
|
+
return None
|
|
129
|
+
|
|
130
|
+
def _get_existing_deployments():
    """Get the names of existing Aibrix deployments from Kubernetes.

    Lists deployments in NAMESPACE that carry the ``model.aibrix.ai/name``
    label and returns their names.

    Returns:
        A set of deployment names; an empty set when no Kubernetes client is
        available.

    Raises:
        Exception: Whatever the Kubernetes client raises when the list call
            fails. A failure is deliberately not reported as an empty set:
            the stale-entry cleanup would read that as "no deployments
            exist" and drop every tracked entry. The cleanup routine already
            catches and logs exceptions.
    """
    if not k8s_apps_v1:
        return set()
    deployments = k8s_apps_v1.list_namespaced_deployment(
        namespace=NAMESPACE,
        label_selector="model.aibrix.ai/name"
    )
    return {d.metadata.name for d in deployments.items}
|
|
142
|
+
|
|
143
|
+
def _cleanup_stale_entries():
    """Drop tracked entries whose deployment is no longer present.

    Only keys in NAMESPACE are considered; a key is stale when its model name
    is not among the existing deployment names. Does nothing without a
    Kubernetes client. Any error is printed rather than raised.
    """
    if not k8s_apps_v1:
        return
    try:
        live = _get_existing_deployments()
        # Work from a snapshot of the keys so deleting is safe.
        stale = [
            (ns, model)
            for (ns, model) in list(events.keys())
            if ns == NAMESPACE and model not in live
        ]
        for stale_key in stale:
            del events[stale_key]
            print(f"Cleaned up stale entry for deployment: {stale_key[1]}")
    except Exception as e:
        print(f"Error during cleanup: {e}")
|
|
160
|
+
|
|
161
|
+
async def _cleanup_task():
    """Background task to periodically clean up stale entries.

    Sleeps CLEANUP_INTERVAL seconds between runs and never returns.
    """
    while True:
        await asyncio.sleep(CLEANUP_INTERVAL)
        _cleanup_stale_entries()


# Strong reference to the running cleanup task. The event loop keeps only a
# weak reference to tasks, so an unreferenced task may be garbage-collected
# before it finishes.
_cleanup_task_handle = None


@app.on_event("startup")
async def startup_event():
    """Start the background cleanup task and keep a reference to it."""
    global _cleanup_task_handle
    _cleanup_task_handle = asyncio.create_task(_cleanup_task())
|
|
171
|
+
|
|
172
|
+
async def _record_and_ack(request):
    """Count the request against its model (if any) and acknowledge it.

    Returns an empty 204 response. A 204 must not carry content, so no JSON
    payload is attached; sending a body with 204 is a protocol violation
    that HTTP servers and proxies may reject.
    """
    body = await request.body()
    model = _extract_model(request.headers, body)
    if model:
        _bump(NAMESPACE, model)
    return PlainTextResponse("", status_code=204)


# Mirror endpoints (same as your API paths); quick 204 response
@app.post("/v1/completions")
@app.post("/v1/chat/completions")
async def mirrored(request: Request):
    """Record demand for mirrored completion / chat-completion requests."""
    return await _record_and_ack(request)


# Catch-all POST (safety net if your gateway uses different paths)
@app.post("/{full_path:path}")
async def mirrored_generic(request: Request, full_path: str):
    """Record demand for a POST to any other path."""
    return await _record_and_ack(request)
|
|
190
|
+
|
|
191
|
+
def _escape_label(value):
    """Escape a value for use inside a Prometheus label.

    Backslash, double quote and line feed are escaped as the text exposition
    format requires. Model names come from request headers/bodies, so they
    must not be able to break out of the quoted label value.
    """
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _label_set(ns, model):
    """Render the namespace/model_name label pair for one series."""
    return f'namespace="{_escape_label(ns)}",model_name="{_escape_label(model)}"'


# Prometheus-friendly aggregate endpoint: export ALL (ns, model)
@app.get("/metrics", response_class=PlainTextResponse)
async def metrics_all():
    """Export suggested replicas and observed RPS for every tracked key.

    Each value is computed once and emitted under both the idiomatic
    ``vllm_*`` names and the legacy ``vllm:*`` names. Samples of a metric are
    kept together directly under that metric's HELP/TYPE lines.
    """
    now = time.time()
    samples = []
    for (ns, model), q in list(events.items()):
        _prune(q, now)
        rps = len(q) / max(WINDOW_SEC, 1)
        samples.append((_label_set(ns, model), _desired(ns, model), rps))

    lines = []
    # Idiomatic names first, then legacy names with colons for back-compat
    for prefix in ("vllm_", "vllm:"):
        lines.append(f"# HELP {prefix}deployment_replicas Number of suggested replicas.")
        lines.append(f"# TYPE {prefix}deployment_replicas gauge")
        for labels, d, _ in samples:
            lines.append(f"{prefix}deployment_replicas{{{labels}}} {d}")
        lines.append(f"# HELP {prefix}observed_rps Incoming requests per second.")
        lines.append(f"# TYPE {prefix}observed_rps gauge")
        for labels, _, rps in samples:
            lines.append(f"{prefix}observed_rps{{{labels}}} {rps:.6f}")
    return "\n".join(lines) + "\n"


# Metrics for KPA and Debugging
@app.get("/metrics/{ns}/{model}", response_class=PlainTextResponse)
async def metrics(ns: str, model: str):
    """Export suggested replicas and observed RPS for one (ns, model) key.

    Uses the legacy ``vllm:*`` metric names. The key is created in the
    tracking table if it was not present.
    """
    d = _desired(ns, model)
    now = time.time()
    q = events[(ns, model)]
    _prune(q, now)
    rps = len(q) / max(WINDOW_SEC, 1)
    labels = _label_set(ns, model)
    return (
        "# HELP vllm:deployment_replicas Number of suggested replicas.\n"
        "# TYPE vllm:deployment_replicas gauge\n"
        f"vllm:deployment_replicas{{{labels}}} {d}\n"
        "# HELP vllm:observed_rps Incoming requests per second.\n"
        "# TYPE vllm:observed_rps gauge\n"
        f"vllm:observed_rps{{{labels}}} {rps:.2f}\n"
    )
|
|
238
|
+
---
|
|
239
|
+
apiVersion: apps/v1
|
|
240
|
+
kind: Deployment
|
|
241
|
+
metadata:
|
|
242
|
+
name: aibrix-activator
|
|
243
|
+
namespace: aibrix-activator
|
|
244
|
+
spec:
|
|
245
|
+
replicas: 1
|
|
246
|
+
selector: { matchLabels: { app: aibrix-activator } }
|
|
247
|
+
template:
|
|
248
|
+
metadata: { labels: { app: aibrix-activator } }
|
|
249
|
+
spec:
|
|
250
|
+
containers:
|
|
251
|
+
- name: activator
|
|
252
|
+
image: python:3.11-slim
|
|
253
|
+
command: ["bash","-lc"]
|
|
254
|
+
args:
|
|
255
|
+
- |
|
|
256
|
+
pip install fastapi uvicorn kubernetes >/dev/null && \
|
|
257
|
+
uvicorn activator:app --host 0.0.0.0 --port 8080
|
|
258
|
+
env:
|
|
259
|
+
- { name: NAMESPACE, value: "default" }
|
|
260
|
+
- { name: WINDOW_SEC, value: "30" }
|
|
261
|
+
- { name: CAPACITY_RPS, value: "1.0" }
|
|
262
|
+
- { name: MIN_REPLICA_ON_WAKE, value: "1" }
|
|
263
|
+
- { name: MAX_REPLICAS, value: "8" }
|
|
264
|
+
- { name: CLEANUP_INTERVAL, value: "300" }
|
|
265
|
+
ports: [{containerPort: 8080}]
|
|
266
|
+
volumeMounts:
|
|
267
|
+
- { name: code, mountPath: /app/activator.py, subPath: activator.py }
|
|
268
|
+
workingDir: /app
|
|
269
|
+
serviceAccountName: aibrix-activator
|
|
270
|
+
volumes:
|
|
271
|
+
- name: code
|
|
272
|
+
configMap: { name: activator-code }
|
|
273
|
+
---
|
|
274
|
+
apiVersion: v1
|
|
275
|
+
kind: Service
|
|
276
|
+
metadata:
|
|
277
|
+
name: aibrix-activator
|
|
278
|
+
namespace: aibrix-activator
|
|
279
|
+
annotations:
|
|
280
|
+
prometheus.io/scrape: "true"
|
|
281
|
+
prometheus.io/port: "8080"
|
|
282
|
+
prometheus.io/path: "/metrics"
|
|
283
|
+
labels:
|
|
284
|
+
app: aibrix-activator
|
|
285
|
+
prometheus-discovery: "true"
|
|
286
|
+
spec:
|
|
287
|
+
selector: { app: aibrix-activator }
|
|
288
|
+
ports:
|
|
289
|
+
- name: http
|
|
290
|
+
port: 8080
|
|
291
|
+
targetPort: 8080
|
|
292
|
+
protocol: TCP
|
|
293
|
+
type: ClusterIP
|
|
294
|
+
---
|
|
295
|
+
apiVersion: monitoring.coreos.com/v1
|
|
296
|
+
kind: ServiceMonitor
|
|
297
|
+
metadata:
|
|
298
|
+
name: aibrix-activator
|
|
299
|
+
namespace: prometheus
|
|
300
|
+
labels:
|
|
301
|
+
app: aibrix-activator
|
|
302
|
+
spec:
|
|
303
|
+
selector:
|
|
304
|
+
matchLabels:
|
|
305
|
+
app: aibrix-activator
|
|
306
|
+
namespaceSelector:
|
|
307
|
+
matchNames:
|
|
308
|
+
- aibrix-activator
|
|
309
|
+
endpoints:
|
|
310
|
+
- port: http
|
|
311
|
+
path: /metrics
|
|
312
|
+
---
|
|
313
|
+
apiVersion: monitoring.coreos.com/v1
|
|
314
|
+
kind: ServiceMonitor
|
|
315
|
+
metadata:
|
|
316
|
+
name: vllm-deployments
|
|
317
|
+
namespace: prometheus
|
|
318
|
+
labels:
|
|
319
|
+
app: vllm-deployments
|
|
320
|
+
spec:
|
|
321
|
+
selector:
|
|
322
|
+
matchLabels:
|
|
323
|
+
prometheus-discovery: "true"
|
|
324
|
+
namespaceSelector:
|
|
325
|
+
matchNames:
|
|
326
|
+
- default
|
|
327
|
+
endpoints:
|
|
328
|
+
- port: serve
|
|
329
|
+
path: /metrics
|
|
330
|
+
---
|
|
331
|
+
apiVersion: gateway.networking.k8s.io/v1beta1
|
|
332
|
+
kind: ReferenceGrant
|
|
333
|
+
metadata:
|
|
334
|
+
name: allow-httproute-to-activator
|
|
335
|
+
namespace: aibrix-activator
|
|
336
|
+
spec:
|
|
337
|
+
from:
|
|
338
|
+
- group: gateway.networking.k8s.io
|
|
339
|
+
kind: HTTPRoute
|
|
340
|
+
namespace: aibrix-system
|
|
341
|
+
to:
|
|
342
|
+
- group: ""
|
|
343
|
+
kind: Service
|
|
344
|
+
name: aibrix-activator
|
|
345
|
+
---
|
|
346
|
+
apiVersion: gateway.networking.k8s.io/v1
|
|
347
|
+
kind: HTTPRoute
|
|
348
|
+
metadata:
|
|
349
|
+
name: activator-mirror-sink
|
|
350
|
+
namespace: aibrix-system
|
|
351
|
+
spec:
|
|
352
|
+
parentRefs:
|
|
353
|
+
- group: gateway.networking.k8s.io
|
|
354
|
+
kind: Gateway
|
|
355
|
+
name: aibrix-eg
|
|
356
|
+
namespace: aibrix-system
|
|
357
|
+
rules:
|
|
358
|
+
- matches:
|
|
359
|
+
- path:
|
|
360
|
+
type: PathPrefix
|
|
361
|
+
value: /__activator_sink__
|
|
362
|
+
backendRefs:
|
|
363
|
+
- name: aibrix-activator
|
|
364
|
+
namespace: aibrix-activator
|
|
365
|
+
port: 8080
|
|
366
|
+
---
|
|
367
|
+
apiVersion: gateway.envoyproxy.io/v1alpha1
|
|
368
|
+
kind: EnvoyPatchPolicy
|
|
369
|
+
metadata:
|
|
370
|
+
name: prewarm-completions-lua
|
|
371
|
+
namespace: aibrix-system
|
|
372
|
+
spec:
|
|
373
|
+
targetRef:
|
|
374
|
+
group: gateway.networking.k8s.io
|
|
375
|
+
kind: Gateway
|
|
376
|
+
name: aibrix-eg
|
|
377
|
+
type: JSONPatch
|
|
378
|
+
jsonPatches:
|
|
379
|
+
- type: "type.googleapis.com/envoy.config.listener.v3.Listener"
|
|
380
|
+
name: "aibrix-system/aibrix-eg/http"
|
|
381
|
+
operation:
|
|
382
|
+
op: add
|
|
383
|
+
path: "/default_filter_chain/filters/0/typed_config/http_filters/0"
|
|
384
|
+
value:
|
|
385
|
+
name: envoy.filters.http.lua
|
|
386
|
+
typed_config:
|
|
387
|
+
"@type": type.googleapis.com/envoy.extensions.filters.http.lua.v3.Lua
|
|
388
|
+
inlineCode: |
|
|
389
|
+
--- Send a wake signal to the activator for completion requests.
-- For paths starting with /v1/completions or /v1/chat/completions, the model
-- name is taken from the "model" header, or failing that from the first
-- 1024 bytes of a JSON body. When a model is found, a small POST carrying it
-- in the "model" header is sent to the activator cluster.
-- Must remain a global: Envoy looks the function up by this name.
-- @param handle Envoy stream handle for the current request.
function envoy_on_request(handle)
  local path = handle:headers():get(":path") or ""
  if not (string.find(path, "^/v1/completions")
      or string.find(path, "^/v1/chat/completions")) then
    return
  end

  -- Try to get model from header first
  local model = handle:headers():get("model") or ""

  -- If no model in header, try to extract from JSON body
  if model == "" then
    local ct = handle:headers():get("content-type") or ""
    if string.find(ct:lower(), "application/json") then
      local body = handle:body()
      if body and body:length() > 0 then
        local raw = body:getBytes(0, math.min(body:length(), 1024))
        -- Simple pattern to extract model from JSON: "model":"value"
        local model_match = raw:match('"model"%s*:%s*"([^"]+)"')
        if model_match then
          model = model_match
        end
      end
    end
  end

  -- Only proceed if we have a model
  if model == "" then
    return
  end

  -- Fire-and-forget wake signal; very short timeout. The trailing `true`
  -- is httpCall's asynchronous flag: without it Envoy suspends this request
  -- until the activator answers or the timeout expires. pcall keeps a
  -- failed call from failing the client request.
  pcall(function()
    handle:httpCall(
      "httproute/aibrix-system/activator-mirror-sink/rule/0",
      {
        [":method"] = "POST",
        [":path"] = "/v1/completions",
        [":authority"] = "aibrix-activator.aibrix-activator.svc.cluster.local",
        ["content-type"] = "application/json",
        ["model"] = model
      },
      "{}",
      5, -- ms
      true -- asynchronous
    )
  end)
end
|