llm-bench-studio 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_bench_studio-0.1.0/PKG-INFO +22 -0
- llm_bench_studio-0.1.0/backend/__init__.py +1 -0
- llm_bench_studio-0.1.0/backend/api/__init__.py +17 -0
- llm_bench_studio-0.1.0/backend/api/benchmarks.py +438 -0
- llm_bench_studio-0.1.0/backend/api/connections.py +87 -0
- llm_bench_studio-0.1.0/backend/api/prompts.py +139 -0
- llm_bench_studio-0.1.0/backend/api/reports.py +104 -0
- llm_bench_studio-0.1.0/backend/api/system.py +53 -0
- llm_bench_studio-0.1.0/backend/api/websocket.py +65 -0
- llm_bench_studio-0.1.0/backend/benchmark/engine.py +404 -0
- llm_bench_studio-0.1.0/backend/benchmark/runner.py +282 -0
- llm_bench_studio-0.1.0/backend/benchmark/scheduler.py +85 -0
- llm_bench_studio-0.1.0/backend/benchmark/stream_parser.py +85 -0
- llm_bench_studio-0.1.0/backend/cli.py +59 -0
- llm_bench_studio-0.1.0/backend/config/__init__.py +5 -0
- llm_bench_studio-0.1.0/backend/config/settings.py +55 -0
- llm_bench_studio-0.1.0/backend/database/__init__.py +19 -0
- llm_bench_studio-0.1.0/backend/database/engine.py +88 -0
- llm_bench_studio-0.1.0/backend/database/repository.py +162 -0
- llm_bench_studio-0.1.0/backend/main.py +138 -0
- llm_bench_studio-0.1.0/backend/metrics/calculator.py +107 -0
- llm_bench_studio-0.1.0/backend/models/__init__.py +10 -0
- llm_bench_studio-0.1.0/backend/models/benchmark_request.py +51 -0
- llm_bench_studio-0.1.0/backend/models/benchmark_run.py +56 -0
- llm_bench_studio-0.1.0/backend/models/connection.py +49 -0
- llm_bench_studio-0.1.0/backend/models/prompt.py +52 -0
- llm_bench_studio-0.1.0/backend/report/__init__.py +1 -0
- llm_bench_studio-0.1.0/backend/report/excel.py +247 -0
- llm_bench_studio-0.1.0/backend/report/html_report.py +448 -0
- llm_bench_studio-0.1.0/backend/report/pdf.py +223 -0
- llm_bench_studio-0.1.0/backend/services/__init__.py +32 -0
- llm_bench_studio-0.1.0/backend/services/connection_service.py +286 -0
- llm_bench_studio-0.1.0/backend/services/prompt_service.py +325 -0
- llm_bench_studio-0.1.0/backend/static/assets/geist-cyrillic-ext-wght-normal-DjL33-gN.woff2 +0 -0
- llm_bench_studio-0.1.0/backend/static/assets/geist-cyrillic-wght-normal-BEAKL7Jp.woff2 +0 -0
- llm_bench_studio-0.1.0/backend/static/assets/geist-latin-ext-wght-normal-DC-KSUi6.woff2 +0 -0
- llm_bench_studio-0.1.0/backend/static/assets/geist-latin-wght-normal-BgDaEnEv.woff2 +0 -0
- llm_bench_studio-0.1.0/backend/static/assets/geist-vietnamese-wght-normal-6IgcOCM7.woff2 +0 -0
- llm_bench_studio-0.1.0/backend/static/assets/index-C-ytfvik.css +2 -0
- llm_bench_studio-0.1.0/backend/static/assets/index-twItFkHg.js +205 -0
- llm_bench_studio-0.1.0/backend/static/favicon.svg +1 -0
- llm_bench_studio-0.1.0/backend/static/icons.svg +24 -0
- llm_bench_studio-0.1.0/backend/static/index.html +16 -0
- llm_bench_studio-0.1.0/backend/utils/__init__.py +36 -0
- llm_bench_studio-0.1.0/backend/utils/errors.py +67 -0
- llm_bench_studio-0.1.0/backend/utils/helpers.py +81 -0
- llm_bench_studio-0.1.0/backend/utils/logger.py +86 -0
- llm_bench_studio-0.1.0/llm_bench_studio.egg-info/PKG-INFO +22 -0
- llm_bench_studio-0.1.0/llm_bench_studio.egg-info/SOURCES.txt +53 -0
- llm_bench_studio-0.1.0/llm_bench_studio.egg-info/dependency_links.txt +1 -0
- llm_bench_studio-0.1.0/llm_bench_studio.egg-info/entry_points.txt +2 -0
- llm_bench_studio-0.1.0/llm_bench_studio.egg-info/requires.txt +12 -0
- llm_bench_studio-0.1.0/llm_bench_studio.egg-info/top_level.txt +1 -0
- llm_bench_studio-0.1.0/pyproject.toml +50 -0
- llm_bench_studio-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-bench-studio
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade benchmarking platform for OpenAI-compatible LLM endpoints
|
|
5
|
+
License: MIT
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: fastapi>=0.115.0
|
|
12
|
+
Requires-Dist: uvicorn[standard]>=0.30.0
|
|
13
|
+
Requires-Dist: pydantic>=2.0.0
|
|
14
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
15
|
+
Requires-Dist: httpx>=0.27.0
|
|
16
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0.0
|
|
17
|
+
Requires-Dist: aiosqlite>=0.20.0
|
|
18
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
19
|
+
Requires-Dist: reportlab>=4.0.0
|
|
20
|
+
Requires-Dist: psutil>=6.0.0
|
|
21
|
+
Requires-Dist: python-multipart>=0.0.9
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Backend package
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""API package."""
|
|
2
|
+
|
|
3
|
+
from backend.api.connections import router as connections_router
|
|
4
|
+
from backend.api.system import router as system_router
|
|
5
|
+
from backend.api.prompts import router as prompts_router
|
|
6
|
+
from backend.api.benchmarks import router as benchmarks_router
|
|
7
|
+
from backend.api.websocket import router as websocket_router
|
|
8
|
+
|
|
9
|
+
# Names exported by this package: the router objects imported above, one per
# API module.
__all__ = [
    "connections_router",
    "system_router",
    "prompts_router",
    "benchmarks_router",
    "websocket_router",
]
|
|
16
|
+
|
|
17
|
+
|
|
@@ -0,0 +1,438 @@
|
|
|
1
|
+
"""API router for starting, stopping, and inspecting benchmark runs."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import json
|
|
5
|
+
from fastapi import APIRouter, Depends, Query
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
from sqlalchemy import select
|
|
8
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
|
+
|
|
10
|
+
from backend.benchmark.engine import benchmark_engine
|
|
11
|
+
from backend.database.engine import get_db_session
|
|
12
|
+
from backend.services.connection_service import ConnectionService
|
|
13
|
+
from backend.models.connection import Connection
|
|
14
|
+
from backend.models.benchmark_run import BenchmarkRun
|
|
15
|
+
from backend.models.benchmark_request import BenchmarkRequest
|
|
16
|
+
from backend.utils.errors import ValidationError
|
|
17
|
+
from backend.api.websocket import broadcast_progress_callback
|
|
18
|
+
|
|
19
|
+
# Every route registered on this router is mounted under /api/benchmarks and
# tagged "benchmarks".
router = APIRouter(prefix="/api/benchmarks", tags=["benchmarks"])
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class StartBenchmarkRequest(BaseModel):
    """Payload schema to initiate a new benchmark."""

    # Required: the connection to benchmark.
    connection_id: str = Field(
        default=...,
        description="ID of the target connection endpoint",
    )
    # Optional prompt selection; None when the caller sends nothing.
    prompt_ids: list[str] | None = Field(
        default=None,
        description="Optional list of prompt IDs to run",
    )

    # Token counts, each limited to the range 1..128000.
    input_tokens: int = Field(default=128, ge=1, le=128000)
    output_tokens: int = Field(default=128, ge=1, le=128000)

    # Load parameters with their accepted ranges.
    concurrency: int = Field(default=10, ge=1, le=500)
    total_requests: int = Field(default=100, ge=1, le=10000)
    warmup_requests: int = Field(default=0, ge=0, le=1000)
    ignore_eos: bool = Field(
        default=False,
        description="Ignore EOS token to force max tokens generation",
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class StartBenchmarkResponse(BaseModel):
    """Response returned on successful start."""

    # Identifier handed back by benchmark_engine.start() in the /start route.
    run_id: str
    # Confirmation text for the caller.
    message: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class StopBenchmarkResponse(BaseModel):
    """Response returned on stop command."""

    # Confirmation text returned by the /stop route.
    message: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class BenchmarkStatusResponse(BaseModel):
    """Response carrying current run status information."""

    # Declared as the response_model of the /status route.
    # None of the fields below has a default, so all of them must be supplied
    # when the model is built -- including run_id, which may be None but is
    # still required.
    run_id: str | None
    status: str
    completed_requests: int
    total_requests: int
    failed_requests: int
    requests_per_second: float
    tokens_per_second: float
    # NOTE(review): the *_ms names suggest milliseconds -- confirm against the
    # code that produces these values.
    avg_latency_ms: float
    avg_ttft_ms: float
    avg_itl_ms: float
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@router.post("/start", response_model=StartBenchmarkResponse, status_code=202)
async def start_benchmark(
    data: StartBenchmarkRequest,
    session: AsyncSession = Depends(get_db_session),
) -> StartBenchmarkResponse:
    """Initiates a concurrent LLM endpoint benchmark run."""
    # Look the connection up first. The returned object is not used, so this
    # call only matters for what get_by_id does when the ID is unknown.
    await ConnectionService(session).get_by_id(data.connection_id)

    # Engine settings copied verbatim from the payload; the tuple fixes the
    # key order of the resulting dict.
    setting_names = (
        "concurrency",
        "total_requests",
        "warmup_requests",
        "ignore_eos",
        "input_tokens",
        "output_tokens",
    )
    engine_config = {name: getattr(data, name) for name in setting_names}

    # A missing or empty prompt selection is forwarded as an empty list.
    selected_prompts = data.prompt_ids or []

    started_run_id = await benchmark_engine.start(
        connection_id=data.connection_id,
        config=engine_config,
        prompt_ids=selected_prompts,
        progress_callback=broadcast_progress_callback,
    )

    return StartBenchmarkResponse(
        run_id=started_run_id,
        message="Benchmark run started successfully",
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@router.post("/stop", response_model=StopBenchmarkResponse)
async def stop_benchmark() -> StopBenchmarkResponse:
    """Stops the currently running benchmark run."""
    engine_active = benchmark_engine.is_running()
    if engine_active:
        await benchmark_engine.stop()
        return StopBenchmarkResponse(message="Stop signal sent to benchmark engine")

    # Nothing is running, so there is nothing to stop.
    raise ValidationError("No benchmark is currently running")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@router.get("/status", response_model=BenchmarkStatusResponse)
async def get_status() -> BenchmarkStatusResponse:
    """Retrieves the real-time status of the benchmark engine."""
    # BenchmarkStatusResponse declares ten required fields. Each one is set
    # here by its declared name; keywords the model does not define would be
    # dropped and missing ones make model construction fail.

    def _engine_value(name, fallback):
        # Read an optional engine attribute, substituting `fallback` when the
        # attribute is absent or None.
        # NOTE(review): only is_running(), active_run_id, completed_requests
        # and failed_requests are known to exist on the engine from this
        # module; confirm the remaining attribute names against
        # backend.benchmark.engine.
        value = getattr(benchmark_engine, name, None)
        return fallback if value is None else value

    running = benchmark_engine.is_running()

    return BenchmarkStatusResponse(
        run_id=benchmark_engine.active_run_id,
        # NOTE(review): "running"/"idle" are the labels chosen here; align
        # them with whatever the UI expects.
        status="running" if running else "idle",
        completed_requests=benchmark_engine.completed_requests,
        total_requests=_engine_value("total_requests", 0),
        failed_requests=benchmark_engine.failed_requests,
        requests_per_second=_engine_value("requests_per_second", 0.0),
        tokens_per_second=_engine_value("tokens_per_second", 0.0),
        avg_latency_ms=_engine_value("avg_latency_ms", 0.0),
        avg_ttft_ms=_engine_value("avg_ttft_ms", 0.0),
        avg_itl_ms=_engine_value("avg_itl_ms", 0.0),
    )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# --- History and Comparison schemas ---
|
|
125
|
+
|
|
126
|
+
class BenchmarkRunSummary(BaseModel):
    # One entry of the /history listing: a stored benchmark run together with
    # the name and model of its connection.
    id: str
    connection_id: str
    # Filled from the outer-joined Connection row; None when the join yields
    # no value.
    connection_name: str | None = None
    model_name: str | None = None
    status: str
    is_favorite: bool
    # Decoded from the run's stored JSON config; {} when decoding fails.
    config: dict
    # NOTE(review): the counters and metrics below are required and do not
    # accept None -- confirm the stored run always has them populated (for
    # example while a run is still in progress).
    total_requests: int
    successful_requests: int
    failed_requests: int
    avg_latency_ms: float
    avg_ttft_ms: float
    avg_itl_ms: float
    throughput_rps: float
    throughput_tps: float
    success_rate: float
    started_at: datetime
    # Optional: may be None.
    completed_at: datetime | None = None
    created_at: datetime
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class BenchmarkRequestDetail(BaseModel):
    # A single stored request of a benchmark run, as embedded in
    # BenchmarkRunDetail.requests. Values are copied field-for-field from a
    # BenchmarkRequest row.
    id: str
    request_index: int
    prompt: str
    response: str
    status: str
    status_code: int
    # Optional: None is accepted.
    error_message: str | None = None
    latency_ms: float
    # Optional: None is accepted.
    ttft_ms: float | None = None
    input_tokens: int
    output_tokens: int
    started_at: datetime
    # NOTE(review): response, status_code and completed_at are required and
    # non-nullable here -- confirm failed requests are never stored with NULL
    # in those columns.
    completed_at: datetime
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class BenchmarkRunDetail(BaseModel):
    # Full view of one benchmark run: the summary fields plus the decoded
    # metrics summary and the per-request records. Used as the response model
    # of /history/{run_id} and, as a list, of /compare.
    id: str
    connection_id: str
    # Filled from the outer-joined Connection row; None when the join yields
    # no value.
    connection_name: str | None = None
    model_name: str | None = None
    status: str
    is_favorite: bool
    # Decoded from the run's stored JSON config; {} when decoding fails.
    config: dict
    total_requests: int
    successful_requests: int
    failed_requests: int
    avg_latency_ms: float
    avg_ttft_ms: float
    avg_itl_ms: float
    throughput_rps: float
    throughput_tps: float
    success_rate: float
    # Decoded from the run's stored metrics JSON; the endpoints pass {} when
    # the stored value is empty or cannot be decoded.
    metrics_summary: dict | None = None
    started_at: datetime
    # Optional: may be None.
    completed_at: datetime | None = None
    created_at: datetime
    # Per-request records; the endpoints load them ordered by request_index.
    requests: list[BenchmarkRequestDetail] = []
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# --- History and Comparison endpoints ---
|
|
189
|
+
|
|
190
|
+
@router.get("/history", response_model=list[BenchmarkRunSummary])
async def get_history(
    offset: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=500),
    session: AsyncSession = Depends(get_db_session),
) -> list[BenchmarkRunSummary]:
    """Retrieves all past benchmark executions with joined connection details."""
    # offset/limit are validated the same way as in the connections listing:
    # a negative offset or a non-positive/unbounded limit is rejected by
    # FastAPI before the query runs.

    def _json_object(raw):
        # Decode `raw` as a JSON object. Anything that is empty, undecodable
        # or not an object becomes {} so the `config: dict` field always
        # validates.
        if not raw:
            return {}
        try:
            decoded = json.loads(raw)
        except (TypeError, ValueError):
            # TypeError: not str/bytes; ValueError: malformed JSON.
            return {}
        return decoded if isinstance(decoded, dict) else {}

    # Newest runs first; the outer join keeps runs whose connection row is
    # missing (name/model then come back as None).
    query = (
        select(BenchmarkRun, Connection.name, Connection.model_name)
        .outerjoin(Connection, BenchmarkRun.connection_id == Connection.id)
        .order_by(BenchmarkRun.created_at.desc())
        .offset(offset)
        .limit(limit)
    )
    result = await session.execute(query)

    return [
        BenchmarkRunSummary(
            id=db_run.id,
            connection_id=db_run.connection_id,
            connection_name=conn_name,
            model_name=model_name,
            status=db_run.status,
            is_favorite=db_run.is_favorite,
            config=_json_object(db_run.config),
            total_requests=db_run.total_requests,
            successful_requests=db_run.successful_requests,
            failed_requests=db_run.failed_requests,
            avg_latency_ms=db_run.avg_latency_ms,
            avg_ttft_ms=db_run.avg_ttft_ms,
            avg_itl_ms=db_run.avg_itl_ms,
            throughput_rps=db_run.throughput_rps,
            throughput_tps=db_run.throughput_tps,
            success_rate=db_run.success_rate,
            started_at=db_run.started_at,
            completed_at=db_run.completed_at,
            created_at=db_run.created_at,
        )
        for db_run, conn_name, model_name in result.all()
    ]
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
@router.get("/history/{run_id}", response_model=BenchmarkRunDetail)
async def get_run_detail(
    run_id: str,
    session: AsyncSession = Depends(get_db_session),
) -> BenchmarkRunDetail:
    """Fetches details of a completed benchmark run including statistics and individual request trials."""
    # Step 1: the run itself plus its connection's name/model (outer join, so
    # a run without a matching connection row is still returned).
    run_stmt = select(BenchmarkRun, Connection.name, Connection.model_name)
    run_stmt = run_stmt.outerjoin(Connection, BenchmarkRun.connection_id == Connection.id)
    run_stmt = run_stmt.where(BenchmarkRun.id == run_id)
    found = (await session.execute(run_stmt)).one_or_none()
    if not found:
        raise ValidationError("Benchmark run not found")

    run, connection_name, connection_model = found

    # Step 2: every request recorded for the run, in request_index order.
    trials_stmt = (
        select(BenchmarkRequest)
        .where(BenchmarkRequest.benchmark_run_id == run_id)
        .order_by(BenchmarkRequest.request_index.asc())
    )
    trial_rows = (await session.execute(trials_stmt)).scalars().all()

    trials = []
    for trial in trial_rows:
        trials.append(
            BenchmarkRequestDetail(
                id=trial.id,
                request_index=trial.request_index,
                prompt=trial.prompt,
                response=trial.response,
                status=trial.status,
                status_code=trial.status_code,
                error_message=trial.error_message,
                latency_ms=trial.latency_ms,
                ttft_ms=trial.ttft_ms,
                input_tokens=trial.input_tokens,
                output_tokens=trial.output_tokens,
                started_at=trial.started_at,
                completed_at=trial.completed_at,
            )
        )

    # Step 3: decode the stored JSON blobs; any failure yields an empty dict.
    try:
        parsed_config = json.loads(run.config)
    except Exception:
        parsed_config = {}

    try:
        raw_metrics = run.metrics_summary
        parsed_metrics = json.loads(raw_metrics) if raw_metrics else {}
    except Exception:
        parsed_metrics = {}

    detail = BenchmarkRunDetail(
        id=run.id,
        connection_id=run.connection_id,
        connection_name=connection_name,
        model_name=connection_model,
        status=run.status,
        is_favorite=run.is_favorite,
        config=parsed_config,
        total_requests=run.total_requests,
        successful_requests=run.successful_requests,
        failed_requests=run.failed_requests,
        avg_latency_ms=run.avg_latency_ms,
        avg_ttft_ms=run.avg_ttft_ms,
        avg_itl_ms=run.avg_itl_ms,
        throughput_rps=run.throughput_rps,
        throughput_tps=run.throughput_tps,
        success_rate=run.success_rate,
        metrics_summary=parsed_metrics,
        started_at=run.started_at,
        completed_at=run.completed_at,
        created_at=run.created_at,
        requests=trials,
    )
    return detail
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
@router.post("/history/{run_id}/favorite")
async def toggle_favorite(
    run_id: str,
    session: AsyncSession = Depends(get_db_session),
):
    """Toggles the favorite state of a historical benchmark run."""
    stmt = select(BenchmarkRun).where(BenchmarkRun.id == run_id)
    result = await session.execute(stmt)
    db_run = result.scalar_one_or_none()
    if not db_run:
        raise ValidationError("Benchmark run not found")

    # Compute the new state into a local BEFORE committing. With the default
    # expire_on_commit behaviour, commit() expires the instance and reading
    # db_run.is_favorite afterwards would need an implicit refresh, which an
    # AsyncSession cannot perform from a plain attribute access.
    new_state = not db_run.is_favorite
    db_run.is_favorite = new_state
    await session.commit()

    return {"id": run_id, "is_favorite": new_state}
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
@router.delete("/history/{run_id}")
async def delete_run(
    run_id: str,
    session: AsyncSession = Depends(get_db_session),
):
    """Deletes a benchmark run from history database."""
    # Fetch the row first so an unknown ID is reported instead of silently
    # deleting nothing.
    lookup = await session.execute(
        select(BenchmarkRun).where(BenchmarkRun.id == run_id)
    )
    target_run = lookup.scalar_one_or_none()
    if not target_run:
        raise ValidationError("Benchmark run not found")

    await session.delete(target_run)
    await session.commit()

    confirmation = {"message": "Run deleted successfully", "id": run_id}
    return confirmation
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
@router.get("/compare", response_model=list[BenchmarkRunDetail])
async def compare_runs(
    run_ids: list[str] = Query(..., description="List of run IDs to compare side by side"),
    session: AsyncSession = Depends(get_db_session),
) -> list[BenchmarkRunDetail]:
    """Fetches details for multiple runs side by side for comparative analysis."""
    if not run_ids:
        raise ValidationError("At least one run ID is required for comparison")

    def _json_object(raw):
        # Decode `raw` as a JSON object. Anything that is empty, undecodable
        # or not an object becomes {} so the dict-typed fields always validate.
        if not raw:
            return {}
        try:
            decoded = json.loads(raw)
        except (TypeError, ValueError):
            # TypeError: not str/bytes; ValueError: malformed JSON.
            return {}
        return decoded if isinstance(decoded, dict) else {}

    query = (
        select(BenchmarkRun, Connection.name, Connection.model_name)
        .outerjoin(Connection, BenchmarkRun.connection_id == Connection.id)
        .where(BenchmarkRun.id.in_(run_ids))
    )
    rows = (await session.execute(query)).all()
    if not rows:
        # None of the requested IDs exist; skip the request query entirely.
        return []

    # Return runs in the order the caller listed them (first occurrence wins
    # for duplicated IDs) instead of the database's unspecified order.
    position = {}
    for index, requested_id in enumerate(run_ids):
        position.setdefault(requested_id, index)
    rows = sorted(rows, key=lambda row: position.get(row[0].id, len(run_ids)))

    # Load the requests of ALL matched runs with one query rather than one
    # query per run. Sorting by request_index and grouping afterwards keeps
    # each run's requests in ascending request_index order.
    req_query = (
        select(BenchmarkRequest)
        .where(BenchmarkRequest.benchmark_run_id.in_([row[0].id for row in rows]))
        .order_by(BenchmarkRequest.request_index.asc())
    )
    req_result = await session.execute(req_query)

    requests_by_run = {}
    for r in req_result.scalars().all():
        requests_by_run.setdefault(r.benchmark_run_id, []).append(
            BenchmarkRequestDetail(
                id=r.id,
                request_index=r.request_index,
                prompt=r.prompt,
                response=r.response,
                status=r.status,
                status_code=r.status_code,
                error_message=r.error_message,
                latency_ms=r.latency_ms,
                ttft_ms=r.ttft_ms,
                input_tokens=r.input_tokens,
                output_tokens=r.output_tokens,
                started_at=r.started_at,
                completed_at=r.completed_at,
            )
        )

    return [
        BenchmarkRunDetail(
            id=db_run.id,
            connection_id=db_run.connection_id,
            connection_name=conn_name,
            model_name=model_name,
            status=db_run.status,
            is_favorite=db_run.is_favorite,
            config=_json_object(db_run.config),
            total_requests=db_run.total_requests,
            successful_requests=db_run.successful_requests,
            failed_requests=db_run.failed_requests,
            avg_latency_ms=db_run.avg_latency_ms,
            avg_ttft_ms=db_run.avg_ttft_ms,
            avg_itl_ms=db_run.avg_itl_ms,
            throughput_rps=db_run.throughput_rps,
            throughput_tps=db_run.throughput_tps,
            success_rate=db_run.success_rate,
            metrics_summary=_json_object(db_run.metrics_summary),
            started_at=db_run.started_at,
            completed_at=db_run.completed_at,
            created_at=db_run.created_at,
            requests=requests_by_run.get(db_run.id, []),
        )
        for db_run, conn_name, model_name in rows
    ]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Connection management API endpoints."""
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter, Depends, Query
|
|
4
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
5
|
+
|
|
6
|
+
from backend.database.engine import get_db_session
|
|
7
|
+
from backend.services.connection_service import (
|
|
8
|
+
ConnectionCreate,
|
|
9
|
+
ConnectionResponse,
|
|
10
|
+
ConnectionService,
|
|
11
|
+
ConnectionTestResult,
|
|
12
|
+
ConnectionUpdate,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Every route registered on this router is mounted under /api/connections and
# tagged "connections".
router = APIRouter(prefix="/api/connections", tags=["connections"])
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_service(session: AsyncSession = Depends(get_db_session)) -> ConnectionService:
    """Dependency provider: wrap the injected DB session in a ConnectionService."""
    service = ConnectionService(session)
    return service
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@router.post("", response_model=ConnectionResponse, status_code=201)
async def create_connection(
    data: ConnectionCreate,
    service: ConnectionService = Depends(_get_service),
) -> ConnectionResponse:
    """Create a new LLM endpoint connection."""
    # Creation is delegated entirely to the service layer.
    created = await service.create(data)
    return created
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@router.get("", response_model=list[ConnectionResponse])
async def list_connections(
    offset: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=500),
    service: ConnectionService = Depends(_get_service),
) -> list[ConnectionResponse]:
    """List all saved connections with pagination."""
    # offset >= 0 and 1 <= limit <= 500 are enforced by the Query declarations.
    page = await service.get_all(offset=offset, limit=limit)
    return page
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@router.get("/{connection_id}", response_model=ConnectionResponse)
async def get_connection(
    connection_id: str,
    service: ConnectionService = Depends(_get_service),
) -> ConnectionResponse:
    """Get a specific connection by ID."""
    # The lookup is delegated entirely to the service layer.
    connection = await service.get_by_id(connection_id)
    return connection
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@router.put("/{connection_id}", response_model=ConnectionResponse)
async def update_connection(
    connection_id: str,
    data: ConnectionUpdate,
    service: ConnectionService = Depends(_get_service),
) -> ConnectionResponse:
    """Update an existing connection."""
    # The service receives the path ID and the update payload unchanged.
    updated = await service.update(connection_id, data)
    return updated
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@router.delete("/{connection_id}", status_code=204)
async def delete_connection(
    connection_id: str,
    service: ConnectionService = Depends(_get_service),
) -> None:
    """Delete a connection."""
    # Whatever the service returns is discarded; the route answers 204 with
    # no body.
    await service.delete(connection_id)
    return None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@router.post("/{connection_id}/test", response_model=ConnectionTestResult)
async def test_connection(
    connection_id: str,
    service: ConnectionService = Depends(_get_service),
) -> ConnectionTestResult:
    """Test a saved connection by sending a minimal request."""
    # The probe itself is implemented by the service layer.
    outcome = await service.test_connection(connection_id)
    return outcome
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@router.post("/test", response_model=ConnectionTestResult)
async def test_connection_config(
    data: ConnectionCreate,
    service: ConnectionService = Depends(_get_service),
) -> ConnectionTestResult:
    """Test a connection from raw config without saving."""
    # The payload uses the same schema as connection creation.
    outcome = await service.test_connection_config(data)
    return outcome
|