mcp-plesk-dev-docs 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,433 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Core benchmark runner — executes retrieval benchmarks across model profiles.
4
+
5
+ This module depends on internal package APIs (AppContainer, SearchService, etc.)
6
+ and is the workhorse behind ``scripts/benchmark_profiles.py``.
7
+
8
+ Usage
9
+ -----
10
+ from plesk_unified.benchmark_runner import run_benchmark
11
+
12
+ results = run_benchmark(queries, profile_name="light")
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import gc
19
+ import os
20
+ import time
21
+ from pathlib import Path
22
+ from typing import TYPE_CHECKING, Any, Optional
23
+ from unittest.mock import AsyncMock
24
+
25
+ import numpy as np
26
+
27
+ from plesk_unified.ai_client import AIClient
28
+ from plesk_unified.benchmark_engines import (
29
+ DEFAULT_PILOT_CONFIG,
30
+ StructurePilotConfig,
31
+ bucket_query,
32
+ rerank_with_structure,
33
+ route_query,
34
+ )
35
+ from plesk_unified.platform_utils import get_optimal_device
36
+ from plesk_unified.tq_index import TurboQuantIndex
37
+ from plesk_unified.types import CategoryEnum
38
+
39
+ if TYPE_CHECKING:
40
+ from plesk_unified.server.mcp_app import AppContainer
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Helpers
45
+ # ---------------------------------------------------------------------------
46
+
47
+
48
+ def _rss_mb() -> float:
49
+ """Return current process RSS in MB (cross-platform best-effort)."""
50
+ try:
51
+ import psutil # type: ignore
52
+
53
+ return psutil.Process().memory_info().rss / 1_048_576
54
+ except ImportError:
55
+ # Fallback: /proc/self/status on Linux
56
+ try:
57
+ status = Path("/proc/self/status").read_text()
58
+ for line in status.splitlines():
59
+ if line.startswith("VmRSS:"):
60
+ return int(line.split()[1]) / 1024
61
+ except Exception:
62
+ pass
63
+ return 0.0
64
+
65
+
66
+ def _hit_rank(results: list[dict[str, Any]], relevant: list[str]) -> int | None:
67
+ """Return 1-based rank of first hit, or None if no hit in top-k.
68
+
69
+ Accepts list of dicts (SearchResult or pageindex-pilot result).
70
+ """
71
+ for rank, result in enumerate(results, start=1):
72
+ text_to_check = result.get("text", "")
73
+ lower = text_to_check.lower()
74
+ if any(kw.lower() in lower for kw in relevant):
75
+ return rank
76
+ return None
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # RAGAS (LLM-as-judge) evaluation
81
+ # ---------------------------------------------------------------------------
82
+
83
+
84
def evaluate_ragas_metrics(  # noqa: PLR0913
    query: str,
    answer: str,
    retrieved_context: str,
    ground_truth: str | None,
    reference_context: str | None,
    ai_client: AIClient,
    model: str | None = None,
) -> dict[str, float]:
    """
    Score one query/answer pair with an LLM judge.

    Each metric is obtained by sending one prompt to
    ``ai_client.evaluate_ragas_score``.

    Parameters
    ----------
    query: The user's search query.
    answer: The generated answer to evaluate.
    retrieved_context: Concatenated retrieved chunks.
    ground_truth: Optional ideal answer; accepted but not used in scoring.
    reference_context: Optional reference context for recall evaluation.
    ai_client: Object providing ``evaluate_ragas_score(prompt, models)``.
    model: Optional judge model; passed on as a one-element list, else ``None``.

    Returns
    -------
    dict with keys ``faithfulness`` and ``context_precision``, plus
    ``context_recall`` only when *reference_context* is non-empty.
    """
    judge_models = [model] if model else None
    retrieved_section = f"RETRIEVED CONTEXT:\n{retrieved_context}\n\n"
    scores = {}

    # Faithfulness: is the answer grounded in the retrieved context?
    faithfulness_prompt = (
        retrieved_section
        + f"ANSWER:\n{answer}\n\n"
        + "Does the answer only use facts from the context? Score 0.0–1.0."
    )
    scores["faithfulness"] = ai_client.evaluate_ragas_score(
        faithfulness_prompt, judge_models
    )

    # Context recall: only measurable when a reference context was supplied.
    if reference_context:
        recall_prompt = (
            f"REFERENCE CONTEXT:\n{reference_context}\n\n"
            + retrieved_section
            + "Was the retrieved context able to recall key facts from the reference? "
            + "Score 0.0–1.0."
        )
        scores["context_recall"] = ai_client.evaluate_ragas_score(
            recall_prompt, judge_models
        )

    # Context precision: are the retrieved chunks relevant to the query?
    precision_prompt = (
        f"QUERY:\n{query}\n\n"
        + retrieved_section
        + "Are the retrieved chunks relevant to answering the query? Score 0.0–1.0."
    )
    scores["context_precision"] = ai_client.evaluate_ragas_score(
        precision_prompt, judge_models
    )

    return scores
140
+
141
+
142
+ def _add_ragas_summary(
143
+ res: dict[str, Any], ragas_metrics: list[dict[str, float]]
144
+ ) -> None:
145
+ """Calculate and add aggregated RAGAS metrics to *res*."""
146
+ evaluated = [m for m in ragas_metrics if m]
147
+ if evaluated:
148
+ res["faithfulness"] = sum(m.get("faithfulness", 0.0) for m in evaluated) / len(
149
+ evaluated
150
+ )
151
+ res["context_recall"] = sum(
152
+ m.get("context_recall", 0.0) for m in evaluated
153
+ ) / len(evaluated)
154
+ res["context_precision"] = sum(
155
+ m.get("context_precision", 0.0) for m in evaluated
156
+ ) / len(evaluated)
157
+ res["ragas_n_evaluated"] = len(evaluated)
158
+
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # Container bootstrap
162
+ # ---------------------------------------------------------------------------
163
+
164
+
165
def _load_container_for_profile(profile_name: str) -> AppContainer:
    """Create an AppContainer after switching the settings to *profile_name*.

    The shared ``settings`` object from ``plesk_unified.settings`` is mutated:
    its ``plesk_model_profile`` attribute is overwritten before it is handed,
    together with the current working directory, to ``create_app``.
    """
    from plesk_unified.settings import settings as global_settings
    from plesk_unified.server.bootstrap import create_app

    # The profile is selected by mutating the global settings singleton.
    global_settings.plesk_model_profile = profile_name
    working_dir = Path(os.getcwd())
    return create_app(working_dir, global_settings)
174
+
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # Engine / retrieval
178
+ # ---------------------------------------------------------------------------
179
+
180
+
181
+ def _get_selected_engine(
182
+ query_str: str,
183
+ bucket: str,
184
+ routing_policy: str,
185
+ engine_name: str,
186
+ pilot_config: StructurePilotConfig | None,
187
+ ) -> tuple[str, StructurePilotConfig | None, str]:
188
+ """Determine the engine and pilot config based on the routing policy."""
189
+ if routing_policy and routing_policy != "baseline-only":
190
+ decision = route_query(query_str, bucket, routing_policy=routing_policy)
191
+ return decision.engine, decision.pilot_config, decision.reason
192
+ return engine_name, pilot_config, "manual-engine"
193
+
194
+
195
+ def _perform_retrieval( # noqa: PLR0913
196
+ container: AppContainer,
197
+ query_str: str,
198
+ category: CategoryEnum | None,
199
+ candidate_limit: int,
200
+ final_k: int,
201
+ selected_engine: str,
202
+ selected_pilot_config: StructurePilotConfig | None,
203
+ ) -> list[dict[str, Any]]:
204
+ """Execute search and reranking steps using SearchService."""
205
+ initial_results: list[dict[str, Any]] = asyncio.run(
206
+ container.search_service.search_raw(
207
+ query=query_str,
208
+ category=category.value if category else None,
209
+ )
210
+ )
211
+
212
+ # PageIndex pilot is applied AFTER initial search and reranking.
213
+ if selected_engine == "pageindex-pilot" and initial_results:
214
+ reranked_dicts = rerank_with_structure(
215
+ query_str,
216
+ initial_results,
217
+ config=selected_pilot_config or DEFAULT_PILOT_CONFIG,
218
+ )[:final_k]
219
+ return reranked_dicts
220
+
221
+ return initial_results[:final_k]
222
+
223
+
224
+ # ---------------------------------------------------------------------------
225
+ # TurboQuant
226
+ # ---------------------------------------------------------------------------
227
+
228
+
229
def _init_tq_index(container: AppContainer) -> TurboQuantIndex:
    """Build a TurboQuantIndex populated from the container's LanceDB table.

    The bit width is read from the ``TQ_BITS`` environment variable (default
    ``"4"``) and the dimension from ``container.settings``.  Up to 100000 rows
    are fetched from the table; when any are returned, their ``"vector"``
    fields are stacked into a float32 array and added to the index together
    with the rows themselves.
    """
    tq_bits = int(os.getenv("TQ_BITS", "4"))
    index = TurboQuantIndex(
        dim=container.settings.embedding_model_dimensions,
        bits=tq_bits,
        device=get_optimal_device(),
    )

    rows = container.lancedb_repo.get_table().search().limit(100000).to_list()
    if not rows:
        # Nothing to index: hand back the empty index.
        return index

    vectors = np.array([row["vector"] for row in rows], dtype=np.float32)
    index.add(vectors, rows)
    return index
242
+
243
+
244
+ # ---------------------------------------------------------------------------
245
+ # Main benchmark entry point
246
+ # ---------------------------------------------------------------------------
247
+
248
+
249
def run_benchmark(  # noqa: PLR0913, PLR0915
    queries: list[dict],
    profile_name: str,
    top_k: int = 10,
    final_k: int = 5,
    refresh: bool = False,
    engine_name: str = "baseline",
    pilot_config: StructurePilotConfig | None = None,
    routing_policy: str = "baseline-only",
    ragas: bool = False,
    ragas_model: str | None = None,
) -> dict[str, Any]:
    """
    Run the full query set against the requested profile.

    The container created for the profile is always shut down before this
    function returns or raises.

    Parameters
    ----------
    queries: List of query dicts with ``query``, ``relevant``, and optional
        ``category``, ``bucket``, ``ground_truth``, ``reference_context``.
    profile_name: Model profile name (e.g. ``"light"``, ``"medium"``).
    top_k: Candidate limit forwarded to the retrieval step.
    final_k: Number of final results kept per query.
    refresh: Whether to refresh the knowledge base first (resets the DB).
    engine_name: ``"baseline"`` or ``"pageindex-pilot"``.
    pilot_config: Structure pilot configuration (or ``None`` for default).
    routing_policy: Per-query routing policy.
    ragas: Whether to compute RAGAS metrics.
    ragas_model: Optional model override for the RAGAS judge.

    Returns
    -------
    dict with keys ``profile``, ``n_queries``, ``hit_rate``, ``mrr``,
    ``avg_latency_s``, ``model_rss_mb``, ``engine``, ``pilot_config``,
    ``routing_policy``, ``bucket_metrics`` and ``per_query``; aggregated RAGAS
    keys are added when *ragas* is true and at least one query was scored.
    """
    container = _load_container_for_profile(profile_name)
    try:
        rss_before = _rss_mb()

        # Force model initialisation through container services
        _ = container.model_runtime.get_embedding_model()
        _ = container.model_runtime.get_reranker()

        if refresh:
            print(f"  Refreshing knowledge base for profile '{profile_name}'...")
            # An AsyncMock attribute serves as an awaitable no-op progress callback.
            dummy_ctx = AsyncMock()
            report = asyncio.run(
                container.indexing_service.refresh_knowledge(
                    progress_callback=dummy_ctx.report_progress,
                    category="all",
                    reset_db=True,
                )
            )
            print(f"  Refresh complete:\n{report}")

        # RSS growth across model initialisation (and the refresh, if any).
        model_rss = _rss_mb() - rss_before

        if container.model_runtime.get_profile().use_turboquant:
            # NOTE(review): the returned index is not retained here — confirm
            # whether building it is needed only for its side effects.
            _init_tq_index(container)

        ai_client = AIClient() if ragas else None

        hits: list[int] = []
        reciprocal_ranks: list[float] = []
        latencies: list[float] = []
        per_query: list[dict[str, Any]] = []
        bucket_metrics_raw: dict[str, dict[str, list]] = {}
        ragas_metrics: list[dict[str, float]] = []

        for q in queries:
            # Latency covers bucketing, routing and retrieval, not RAGAS scoring.
            t0 = time.perf_counter()
            bucket = q.get("bucket") or bucket_query(q["query"])
            category_str = q.get("category")
            category: Optional[CategoryEnum] = None
            if category_str and category_str != "mixed":
                category = CategoryEnum(category_str)

            bm = bucket_metrics_raw.setdefault(
                bucket, {"hits": [], "rrs": [], "latencies": []}
            )

            sel_engine, sel_pilot, reason = _get_selected_engine(
                q["query"], bucket, routing_policy, engine_name, pilot_config
            )

            final_search_results = _perform_retrieval(
                container,
                q["query"],
                category,
                top_k,
                final_k,
                sel_engine,
                sel_pilot,
            )

            latency = time.perf_counter() - t0
            rank = _hit_rank(final_search_results, q["relevant"])

            hit_val = 1 if rank is not None else 0
            rr_val = 1 / rank if rank is not None else 0.0
            hits.append(hit_val)
            reciprocal_ranks.append(rr_val)
            latencies.append(latency)
            bm["hits"].append(hit_val)
            bm["rrs"].append(rr_val)
            bm["latencies"].append(latency)

            # Stays empty when RAGAS is disabled, so every query has an entry.
            current_ragas: dict[str, float] = {}
            if ragas and ai_client:
                retrieved_context = "\n".join(
                    r.get("text", "") for r in final_search_results
                )
                gen_answer = ai_client.generate_answer(q["query"], retrieved_context)
                current_ragas = evaluate_ragas_metrics(
                    q["query"],
                    gen_answer,
                    retrieved_context,
                    q.get("ground_truth"),
                    q.get("reference_context"),
                    ai_client,
                    ragas_model,
                )
            ragas_metrics.append(current_ragas)

            per_query.append(
                {
                    "query": q["query"],
                    "hit": rank is not None,
                    "rr": rr_val,
                    "latency_s": latency,
                    "bucket": bucket,
                    "selected_engine": sel_engine,
                    "selected_pilot_config": sel_pilot.name if sel_pilot else "base",
                    "routing_reason": reason,
                    **current_ragas,
                }
            )

        n = len(queries)
        res: dict[str, Any] = {
            "profile": profile_name,
            "n_queries": n,
            "hit_rate": sum(hits) / n if n else 0.0,
            "mrr": sum(reciprocal_ranks) / n if n else 0.0,
            "avg_latency_s": sum(latencies) / n if n else 0.0,
            "model_rss_mb": model_rss,
            "engine": engine_name,
            "pilot_config": pilot_config.name if pilot_config else None,
            "routing_policy": routing_policy,
            "bucket_metrics": {
                name: {
                    "n": len(m["hits"]),
                    "hit_rate": sum(m["hits"]) / len(m["hits"]) if m["hits"] else 0.0,
                    "mrr": sum(m["rrs"]) / len(m["rrs"]) if m["rrs"] else 0.0,
                    "avg_latency_s": (
                        sum(m["latencies"]) / len(m["latencies"])
                        if m["latencies"]
                        else 0.0
                    ),
                }
                for name, m in bucket_metrics_raw.items()
            },
            "per_query": per_query,
        }

        if ragas:
            _add_ragas_summary(res, ragas_metrics)

        return res
    finally:
        # Clean up resources to prevent context leaks and memory accumulation,
        # including when a query, the refresh or the RAGAS judge raised.
        container.shutdown()
        gc.collect()
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
SUITES_DIR = Path(__file__).parent.parent.joinpath("benchmarks", "suites")


def load_suite(name: str) -> list[dict]:
    """Read the benchmark suite *name* from ``SUITES_DIR / f"{name}.json"``.

    Returns the decoded JSON content of that file, or an empty list when no
    such file exists.
    """
    suite_file = SUITES_DIR / f"{name}.json"
    if suite_file.exists():
        with suite_file.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    # A missing suite is not treated as an error; callers get an empty suite.
    return []
17
+
18
+
19
# Built-in suites, each read once at import time from its JSON definition.
BENCHMARK_SUITES: dict[str, list[dict]] = {
    suite_name: load_suite(suite_name)
    for suite_name in ("control", "structural", "long-doc", "multi-hop")
}

# Per-suite aliases; each refers to the same list object held in the mapping.
BUILTIN_QUERIES = BENCHMARK_SUITES["control"]
STRUCTURAL_QUERIES = BENCHMARK_SUITES["structural"]
LONG_DOC_QUERIES = BENCHMARK_SUITES["long-doc"]
MULTI_HOP_QUERIES = BENCHMARK_SUITES["multi-hop"]
+ }