mcp-plesk-dev-docs 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_plesk_dev_docs-0.4.2.dist-info/METADATA +221 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/RECORD +30 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/WHEEL +5 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/entry_points.txt +2 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/licenses/LICENSE +21 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/licenses/NOTICE +0 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/top_level.txt +1 -0
- plesk_unified/__init__.py +3 -0
- plesk_unified/ai_client.py +257 -0
- plesk_unified/benchmark_engines.py +330 -0
- plesk_unified/benchmark_gates.py +254 -0
- plesk_unified/benchmark_reporting.py +107 -0
- plesk_unified/benchmark_runner.py +433 -0
- plesk_unified/benchmark_suites.py +30 -0
- plesk_unified/chunking.py +360 -0
- plesk_unified/error_handling.py +112 -0
- plesk_unified/html_utils.py +217 -0
- plesk_unified/indexing.py +53 -0
- plesk_unified/io_utils.py +287 -0
- plesk_unified/log_handler.py +209 -0
- plesk_unified/model_config.py +218 -0
- plesk_unified/platform_utils.py +214 -0
- plesk_unified/settings.py +93 -0
- plesk_unified/summary_cache.py +55 -0
- plesk_unified/tq_index.py +85 -0
- plesk_unified/turboquant/__init__.py +21 -0
- plesk_unified/turboquant/compressors.py +190 -0
- plesk_unified/turboquant/lloyd_max.py +190 -0
- plesk_unified/turboquant/turboquant.py +249 -0
- plesk_unified/types.py +27 -0
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Core benchmark runner — executes retrieval benchmarks across model profiles.
|
|
4
|
+
|
|
5
|
+
This module depends on internal package APIs (AppContainer, SearchService, etc.)
|
|
6
|
+
and is the workhorse behind ``scripts/benchmark_profiles.py``.
|
|
7
|
+
|
|
8
|
+
Usage
|
|
9
|
+
-----
|
|
10
|
+
from plesk_unified.benchmark_runner import run_benchmark
|
|
11
|
+
|
|
12
|
+
results = run_benchmark(queries, profile_name="light")
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import gc
|
|
19
|
+
import os
|
|
20
|
+
import time
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
23
|
+
from unittest.mock import AsyncMock
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
|
|
27
|
+
from plesk_unified.ai_client import AIClient
|
|
28
|
+
from plesk_unified.benchmark_engines import (
|
|
29
|
+
DEFAULT_PILOT_CONFIG,
|
|
30
|
+
StructurePilotConfig,
|
|
31
|
+
bucket_query,
|
|
32
|
+
rerank_with_structure,
|
|
33
|
+
route_query,
|
|
34
|
+
)
|
|
35
|
+
from plesk_unified.platform_utils import get_optimal_device
|
|
36
|
+
from plesk_unified.tq_index import TurboQuantIndex
|
|
37
|
+
from plesk_unified.types import CategoryEnum
|
|
38
|
+
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
from plesk_unified.server.mcp_app import AppContainer
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Helpers
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _rss_mb() -> float:
|
|
49
|
+
"""Return current process RSS in MB (cross-platform best-effort)."""
|
|
50
|
+
try:
|
|
51
|
+
import psutil # type: ignore
|
|
52
|
+
|
|
53
|
+
return psutil.Process().memory_info().rss / 1_048_576
|
|
54
|
+
except ImportError:
|
|
55
|
+
# Fallback: /proc/self/status on Linux
|
|
56
|
+
try:
|
|
57
|
+
status = Path("/proc/self/status").read_text()
|
|
58
|
+
for line in status.splitlines():
|
|
59
|
+
if line.startswith("VmRSS:"):
|
|
60
|
+
return int(line.split()[1]) / 1024
|
|
61
|
+
except Exception:
|
|
62
|
+
pass
|
|
63
|
+
return 0.0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _hit_rank(results: list[dict[str, Any]], relevant: list[str]) -> int | None:
|
|
67
|
+
"""Return 1-based rank of first hit, or None if no hit in top-k.
|
|
68
|
+
|
|
69
|
+
Accepts list of dicts (SearchResult or pageindex-pilot result).
|
|
70
|
+
"""
|
|
71
|
+
for rank, result in enumerate(results, start=1):
|
|
72
|
+
text_to_check = result.get("text", "")
|
|
73
|
+
lower = text_to_check.lower()
|
|
74
|
+
if any(kw.lower() in lower for kw in relevant):
|
|
75
|
+
return rank
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
# RAGAS (LLM-as-judge) evaluation
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def evaluate_ragas_metrics(  # noqa: PLR0913
    query: str,
    answer: str,
    retrieved_context: str,
    ground_truth: str | None,
    reference_context: str | None,
    ai_client: AIClient,
    model: str | None = None,
) -> dict[str, float]:
    """
    Score one query/answer pair with an LLM judge.

    Each metric is obtained by sending one prompt to
    ``ai_client.evaluate_ragas_score``; the prompts are issued in the order
    faithfulness, context recall (optional), context precision.

    Parameters
    ----------
    query: The user's search query.
    answer: The generated answer to evaluate.
    retrieved_context: Concatenated retrieved chunks.
    ground_truth: Optional ideal answer; accepted but not used in any prompt.
    reference_context: Optional reference context.  Context recall is only
        scored when this is truthy.
    ai_client: Object exposing ``evaluate_ragas_score(prompt, models)``.
    model: Optional judge model; forwarded as a one-element list, or
        ``None`` when not given.

    Returns
    -------
    dict with keys ``faithfulness`` and ``context_precision``, plus
    ``context_recall`` when *reference_context* was supplied.
    """
    judge_models = None if not model else [model]

    def _judge(prompt: str) -> float:
        # Single place where the judge LLM is invoked.
        return ai_client.evaluate_ragas_score(prompt, judge_models)

    # Faithfulness: is the answer grounded in what was retrieved?
    scores = {
        "faithfulness": _judge(
            f"RETRIEVED CONTEXT:\n{retrieved_context}\n\n"
            f"ANSWER:\n{answer}\n\n"
            "Does the answer only use facts from the context? Score 0.0–1.0."
        )
    }

    # Context recall needs something to compare the retrieval against.
    if reference_context:
        scores["context_recall"] = _judge(
            f"REFERENCE CONTEXT:\n{reference_context}\n\n"
            f"RETRIEVED CONTEXT:\n{retrieved_context}\n\n"
            "Was the retrieved context able to recall key facts from the reference? "
            "Score 0.0–1.0."
        )

    # Context precision: are the retrieved chunks on-topic for the query?
    scores["context_precision"] = _judge(
        f"QUERY:\n{query}\n\n"
        f"RETRIEVED CONTEXT:\n{retrieved_context}\n\n"
        "Are the retrieved chunks relevant to answering the query? Score 0.0–1.0."
    )

    return scores
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _add_ragas_summary(
|
|
143
|
+
res: dict[str, Any], ragas_metrics: list[dict[str, float]]
|
|
144
|
+
) -> None:
|
|
145
|
+
"""Calculate and add aggregated RAGAS metrics to *res*."""
|
|
146
|
+
evaluated = [m for m in ragas_metrics if m]
|
|
147
|
+
if evaluated:
|
|
148
|
+
res["faithfulness"] = sum(m.get("faithfulness", 0.0) for m in evaluated) / len(
|
|
149
|
+
evaluated
|
|
150
|
+
)
|
|
151
|
+
res["context_recall"] = sum(
|
|
152
|
+
m.get("context_recall", 0.0) for m in evaluated
|
|
153
|
+
) / len(evaluated)
|
|
154
|
+
res["context_precision"] = sum(
|
|
155
|
+
m.get("context_precision", 0.0) for m in evaluated
|
|
156
|
+
) / len(evaluated)
|
|
157
|
+
res["ragas_n_evaluated"] = len(evaluated)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
# Container bootstrap
|
|
162
|
+
# ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _load_container_for_profile(profile_name: str) -> AppContainer:
    """Build an application container for *profile_name*.

    The imported ``settings`` object has its ``plesk_model_profile``
    attribute overwritten with *profile_name* before ``create_app`` is called
    with the current working directory.  The assignment is not reverted, so
    the settings object keeps the requested profile after this call.
    """
    # Imported at call time rather than at module import time.
    from plesk_unified.settings import settings as shared_settings
    from plesk_unified.server.bootstrap import create_app

    # Point the settings object at the requested profile before the app is
    # assembled from it.
    shared_settings.plesk_model_profile = profile_name
    working_dir = Path(os.getcwd())
    return create_app(working_dir, shared_settings)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Engine / retrieval
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _get_selected_engine(
|
|
182
|
+
query_str: str,
|
|
183
|
+
bucket: str,
|
|
184
|
+
routing_policy: str,
|
|
185
|
+
engine_name: str,
|
|
186
|
+
pilot_config: StructurePilotConfig | None,
|
|
187
|
+
) -> tuple[str, StructurePilotConfig | None, str]:
|
|
188
|
+
"""Determine the engine and pilot config based on the routing policy."""
|
|
189
|
+
if routing_policy and routing_policy != "baseline-only":
|
|
190
|
+
decision = route_query(query_str, bucket, routing_policy=routing_policy)
|
|
191
|
+
return decision.engine, decision.pilot_config, decision.reason
|
|
192
|
+
return engine_name, pilot_config, "manual-engine"
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _perform_retrieval( # noqa: PLR0913
|
|
196
|
+
container: AppContainer,
|
|
197
|
+
query_str: str,
|
|
198
|
+
category: CategoryEnum | None,
|
|
199
|
+
candidate_limit: int,
|
|
200
|
+
final_k: int,
|
|
201
|
+
selected_engine: str,
|
|
202
|
+
selected_pilot_config: StructurePilotConfig | None,
|
|
203
|
+
) -> list[dict[str, Any]]:
|
|
204
|
+
"""Execute search and reranking steps using SearchService."""
|
|
205
|
+
initial_results: list[dict[str, Any]] = asyncio.run(
|
|
206
|
+
container.search_service.search_raw(
|
|
207
|
+
query=query_str,
|
|
208
|
+
category=category.value if category else None,
|
|
209
|
+
)
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
# PageIndex pilot is applied AFTER initial search and reranking.
|
|
213
|
+
if selected_engine == "pageindex-pilot" and initial_results:
|
|
214
|
+
reranked_dicts = rerank_with_structure(
|
|
215
|
+
query_str,
|
|
216
|
+
initial_results,
|
|
217
|
+
config=selected_pilot_config or DEFAULT_PILOT_CONFIG,
|
|
218
|
+
)[:final_k]
|
|
219
|
+
return reranked_dicts
|
|
220
|
+
|
|
221
|
+
return initial_results[:final_k]
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
# ---------------------------------------------------------------------------
|
|
225
|
+
# TurboQuant
|
|
226
|
+
# ---------------------------------------------------------------------------
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _init_tq_index(container: AppContainer) -> TurboQuantIndex:
    """Create a ``TurboQuantIndex`` and fill it from the container's table.

    The bit width comes from the ``TQ_BITS`` environment variable (default
    ``"4"``), the dimension from
    ``container.settings.embedding_model_dimensions`` and the device from
    ``get_optimal_device()``.  Up to 100000 rows are read from the LanceDB
    table; when any are returned, their ``"vector"`` fields are stacked into
    a float32 array and added to the index together with the rows.
    """
    bit_width = int(os.getenv("TQ_BITS", "4"))
    index = TurboQuantIndex(
        dim=container.settings.embedding_model_dimensions,
        bits=bit_width,
        device=get_optimal_device(),
    )

    # The query is capped at 100000 rows; anything beyond that is not indexed.
    rows = container.lancedb_repo.get_table().search().limit(100000).to_list()
    if not rows:
        return index

    vectors = np.array([row["vector"] for row in rows], dtype=np.float32)
    index.add(vectors, rows)
    return index
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
# Main benchmark entry point
|
|
246
|
+
# ---------------------------------------------------------------------------
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def run_benchmark(  # noqa: PLR0913, PLR0915
    queries: list[dict],
    profile_name: str,
    top_k: int = 10,
    final_k: int = 5,
    refresh: bool = False,
    engine_name: str = "baseline",
    pilot_config: StructurePilotConfig | None = None,
    routing_policy: str = "baseline-only",
    ragas: bool = False,
    ragas_model: str | None = None,
) -> dict[str, Any]:
    """
    Run the full query set against the requested profile.

    The container created for the profile is always shut down before this
    function exits, including when a query, the refresh step or the RAGAS
    judge raises.

    Parameters
    ----------
    queries: List of query dicts with ``query``, ``relevant``, optional
        ``category``, ``bucket``, ``ground_truth`` and ``reference_context``.
    profile_name: Model profile name (e.g. ``"light"``, ``"medium"``).
    top_k: ANN candidates before reranking.
    final_k: Number of final results returned.
    refresh: Whether to refresh the knowledge base first.
    engine_name: ``"baseline"`` or ``"pageindex-pilot"``.
    pilot_config: Structure pilot configuration (or ``None`` for default).
    routing_policy: Per-query routing policy.
    ragas: Whether to compute RAGAS metrics.
    ragas_model: Optional model override for the RAGAS judge.

    Returns
    -------
    dict with keys ``profile``, ``n_queries``, ``hit_rate``, ``mrr``,
    ``avg_latency_s``, ``model_rss_mb``, ``engine``, ``pilot_config``,
    ``routing_policy``, ``bucket_metrics`` and ``per_query``, plus the
    aggregated RAGAS keys when *ragas* is true.

    Raises
    ------
    ValueError: If a query's ``category`` is not a valid ``CategoryEnum``
        value (other than ``"mixed"``).
    KeyError: If a query dict lacks ``query`` or ``relevant``.
    """
    container = _load_container_for_profile(profile_name)
    try:
        rss_before = _rss_mb()

        # Force model initialisation through container services so that the
        # RSS delta below includes the loaded models.
        container.model_runtime.get_embedding_model()
        container.model_runtime.get_reranker()

        if refresh:
            print(f" Refreshing knowledge base for profile '{profile_name}'...")
            # The refresh API wants a progress callback; an AsyncMock stands
            # in for it because no progress is reported here.
            dummy_ctx = AsyncMock()
            report = asyncio.run(
                container.indexing_service.refresh_knowledge(
                    progress_callback=dummy_ctx.report_progress,
                    category="all",
                    reset_db=True,
                )
            )
            print(f" Refresh complete:\n{report}")

        model_rss = _rss_mb() - rss_before

        if container.model_runtime.get_profile().use_turboquant:
            # NOTE(review): the returned index is not kept or passed on here;
            # confirm whether building it is only meant as a warm-up.
            _init_tq_index(container)

        ai_client = AIClient() if ragas else None

        hits: list[int] = []
        reciprocal_ranks: list[float] = []
        latencies: list[float] = []
        query_meta: list[dict[str, Any]] = []
        bucket_metrics_raw: dict[str, dict[str, list]] = {}
        ragas_metrics: list[dict[str, float]] = []

        for q in queries:
            t0 = time.perf_counter()
            bucket = q.get("bucket") or bucket_query(q["query"])
            category_str = q.get("category")
            category: CategoryEnum | None = None
            # "mixed" is not a concrete category: search without a filter.
            if category_str and category_str != "mixed":
                category = CategoryEnum(category_str)

            bm = bucket_metrics_raw.setdefault(
                bucket, {"hits": [], "rrs": [], "latencies": []}
            )

            sel_engine, sel_pilot, reason = _get_selected_engine(
                q["query"], bucket, routing_policy, engine_name, pilot_config
            )

            final_search_results = _perform_retrieval(
                container,
                q["query"],
                category,
                top_k,
                final_k,
                sel_engine,
                sel_pilot,
            )

            # Latency covers bucketing, routing and retrieval, but not the
            # hit check or the RAGAS judge calls below.
            latency = time.perf_counter() - t0
            rank = _hit_rank(final_search_results, q["relevant"])

            hit_val = 1 if rank is not None else 0
            rr_val = 1 / rank if rank is not None else 0.0
            hits.append(hit_val)
            reciprocal_ranks.append(rr_val)
            latencies.append(latency)
            bm["hits"].append(hit_val)
            bm["rrs"].append(rr_val)
            bm["latencies"].append(latency)

            current_ragas: dict[str, float] = {}
            if ragas and ai_client:
                retrieved_context = "\n".join(
                    r.get("text", "") for r in final_search_results
                )
                gen_answer = ai_client.generate_answer(q["query"], retrieved_context)
                current_ragas = evaluate_ragas_metrics(
                    q["query"],
                    gen_answer,
                    retrieved_context,
                    q.get("ground_truth"),
                    q.get("reference_context"),
                    ai_client,
                    ragas_model,
                )
            # Appended for every query (possibly empty) so that the list
            # stays aligned with ``query_meta`` when ``per_query`` is built.
            ragas_metrics.append(current_ragas)

            top_score = (
                final_search_results[0].get("_relevance", 0.0)
                if final_search_results
                else 0.0
            )
            query_meta.append(
                {
                    "query": q["query"],
                    "hit": rank is not None,
                    "rr": rr_val,
                    "score": top_score,
                    "latency_s": latency,
                    "bucket": bucket,
                    "selected_engine": sel_engine,
                    "selected_pilot_config": sel_pilot.name if sel_pilot else "base",
                    "routing_reason": reason,
                }
            )

        n = len(queries)
        res: dict[str, Any] = {
            "profile": profile_name,
            "n_queries": n,
            "hit_rate": sum(hits) / n if n else 0.0,
            "mrr": sum(reciprocal_ranks) / n if n else 0.0,
            "avg_latency_s": sum(latencies) / n if n else 0.0,
            "model_rss_mb": model_rss,
            "engine": engine_name,
            "pilot_config": pilot_config.name if pilot_config else None,
            "routing_policy": routing_policy,
            "bucket_metrics": {
                name: {
                    "n": len(m["hits"]),
                    "hit_rate": sum(m["hits"]) / len(m["hits"]) if m["hits"] else 0.0,
                    "mrr": sum(m["rrs"]) / len(m["rrs"]) if m["rrs"] else 0.0,
                    "avg_latency_s": (
                        sum(m["latencies"]) / len(m["latencies"])
                        if m["latencies"]
                        else 0.0
                    ),
                }
                for name, m in bucket_metrics_raw.items()
            },
            # ``score`` from the per-query metadata is intentionally not
            # copied here, matching the established output shape.
            "per_query": [
                {
                    "query": meta["query"],
                    "hit": meta["hit"],
                    "rr": meta["rr"],
                    "latency_s": meta["latency_s"],
                    "bucket": meta["bucket"],
                    "selected_engine": meta["selected_engine"],
                    "selected_pilot_config": meta["selected_pilot_config"],
                    "routing_reason": meta["routing_reason"],
                    **scores,
                }
                for meta, scores in zip(query_meta, ragas_metrics)
            ],
        }

        if ragas:
            _add_ragas_summary(res, ragas_metrics)

        return res
    finally:
        # Clean up resources to prevent context leaks and memory
        # accumulation; in ``finally`` so a failing run releases them too.
        container.shutdown()
        gc.collect()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Directory holding the suite JSON files: ``benchmarks/suites`` next to the
# directory that contains this module's package directory.
SUITES_DIR = Path(__file__).parent.parent / "benchmarks" / "suites"


def load_suite(name: str) -> list[dict]:
    """Return the parsed contents of ``SUITES_DIR/<name>.json``.

    A suite whose file does not exist yields an empty list instead of an
    error.  The file is read as UTF-8 and decoded with ``json``; whatever
    the JSON document contains is returned as-is.
    """
    suite_file = SUITES_DIR / f"{name}.json"
    if suite_file.exists():
        return json.loads(suite_file.read_text(encoding="utf-8"))
    # Missing suite: fall back to "no queries" rather than failing.
    return []
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Registry of the built-in suites, keyed by suite name.  Each suite is read
# once, in the order listed, from its JSON definition.
BENCHMARK_SUITES: dict[str, list[dict]] = {
    suite_name: load_suite(suite_name)
    for suite_name in ("control", "structural", "long-doc", "multi-hop")
}

# Named handles onto the same list objects held by the registry.
BUILTIN_QUERIES = BENCHMARK_SUITES["control"]
STRUCTURAL_QUERIES = BENCHMARK_SUITES["structural"]
LONG_DOC_QUERIES = BENCHMARK_SUITES["long-doc"]
MULTI_HOP_QUERIES = BENCHMARK_SUITES["multi-hop"]
|