mcp-plesk-dev-docs 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_plesk_dev_docs-0.4.2.dist-info/METADATA +221 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/RECORD +30 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/WHEEL +5 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/entry_points.txt +2 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/licenses/LICENSE +21 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/licenses/NOTICE +0 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/top_level.txt +1 -0
- plesk_unified/__init__.py +3 -0
- plesk_unified/ai_client.py +257 -0
- plesk_unified/benchmark_engines.py +330 -0
- plesk_unified/benchmark_gates.py +254 -0
- plesk_unified/benchmark_reporting.py +107 -0
- plesk_unified/benchmark_runner.py +433 -0
- plesk_unified/benchmark_suites.py +30 -0
- plesk_unified/chunking.py +360 -0
- plesk_unified/error_handling.py +112 -0
- plesk_unified/html_utils.py +217 -0
- plesk_unified/indexing.py +53 -0
- plesk_unified/io_utils.py +287 -0
- plesk_unified/log_handler.py +209 -0
- plesk_unified/model_config.py +218 -0
- plesk_unified/platform_utils.py +214 -0
- plesk_unified/settings.py +93 -0
- plesk_unified/summary_cache.py +55 -0
- plesk_unified/tq_index.py +85 -0
- plesk_unified/turboquant/__init__.py +21 -0
- plesk_unified/turboquant/compressors.py +190 -0
- plesk_unified/turboquant/lloyd_max.py +190 -0
- plesk_unified/turboquant/turboquant.py +249 -0
- plesk_unified/types.py +27 -0
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Token pattern: runs of ASCII letters, digits and the characters "_", ".", "/",
# "+", "-". Anything else (whitespace, "?", ",", ...) acts as a separator.
_WORD_RE = re.compile(r"[A-Za-z0-9_./+-]+")
# Lowercase words that tokenize() removes from its output.
_STOPWORDS = {
    "a",
    "an",
    "and",
    "are",
    "for",
    "from",
    "how",
    "in",
    "is",
    "of",
    "on",
    "or",
    "the",
    "to",
    "via",
    "with",
    "what",
    "which",
    "when",
    "where",
    "who",
    "why",
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class SearchResult:
    """Immutable record holding one search hit: text, title, score, metadata."""

    text: str
    title: str
    score: float
    # Free-form extra fields; the schema is not defined in this module.
    metadata: dict[str, Any]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True)
class RoutingDecision:
    """Immutable outcome of route_query(): the chosen engine and the reason."""

    # route_query() sets this to either "baseline" or "pageindex-pilot".
    engine: str
    # None whenever route_query() selects the "baseline" engine.
    pilot_config: StructurePilotConfig | None
    # Short tag naming the rule that fired, e.g. "policy-baseline-only".
    reason: str
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class StructurePilotConfig:
    """Immutable set of weights consumed by structure_pilot_score()."""

    # Identifier used by get_pilot_config_by_name() and recorded on reranked
    # results under the "_pilot_config" key.
    name: str
    # Per-field weights handed to _field_score() for the result's "title",
    # "breadcrumb", "filename" and "text" values respectively.
    title_weight: float = 2.5
    breadcrumb_weight: float = 1.8
    filename_weight: float = 0.9
    text_weight: float = 0.35
    # Phrase bonuses: passed to _field_score() for the matching field and added
    # once more when the whole tokenized query occurs in that field.
    title_phrase_bonus: float = 1.5
    breadcrumb_phrase_bonus: float = 1.0
    # Multiplier applied to the bonus derived from the result's incoming rank.
    rank_weight: float = 0.5
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Config named "base"; every weight keeps its StructurePilotConfig default.
DEFAULT_PILOT_CONFIG = StructurePilotConfig(name="base")

# Every selectable pilot config. get_pilot_config_by_name() resolves names
# against this list and falls back to DEFAULT_PILOT_CONFIG.
PILOT_CONFIGS: list[StructurePilotConfig] = [
    DEFAULT_PILOT_CONFIG,
    StructurePilotConfig(
        name="title-focused",
        title_weight=3.0,
        breadcrumb_weight=1.5,
        filename_weight=0.8,
        text_weight=0.25,
        title_phrase_bonus=2.0,
        breadcrumb_phrase_bonus=0.75,
        rank_weight=0.35,
    ),
    StructurePilotConfig(
        name="breadcrumb-focused",
        title_weight=2.2,
        breadcrumb_weight=2.4,
        filename_weight=1.0,
        text_weight=0.2,
        title_phrase_bonus=1.25,
        breadcrumb_phrase_bonus=1.5,
        rank_weight=0.4,
    ),
    StructurePilotConfig(
        name="metadata-heavy",
        title_weight=2.8,
        breadcrumb_weight=2.2,
        filename_weight=1.2,
        text_weight=0.15,
        title_phrase_bonus=1.75,
        breadcrumb_phrase_bonus=1.25,
        rank_weight=0.3,
    ),
]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase search terms.

    Tokens are the matches of ``_WORD_RE``; stopwords and tokens of length
    one are discarded. A falsy *text* yields an empty list.
    """
    kept: list[str] = []
    for raw in _WORD_RE.findall(text or ""):
        word = raw.lower()
        if len(word) <= 1 or word in _STOPWORDS:
            continue
        kept.append(word)
    return kept
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def bucket_query(query: str) -> str:
    """Classify a query as multi-hop, structural, lookup, or mixed.

    The lowercased query is tested for substring markers. Buckets are checked
    in the order multi-hop, structural, lookup; the first bucket with any
    marker present wins. "mixed" is returned when nothing matches (including
    for an empty or ``None`` query).
    """
    haystack = (query or "").lower()

    # Ordered (bucket, markers) pairs: earlier rows take precedence.
    marker_table = (
        (
            "multi-hop",
            (
                " and ",
                " then ",
                " also ",
                "together",
                "combined",
                " followed by ",
                " as well as ",
            ),
        ),
        (
            "structural",
            (
                "how to",
                "add ",
                "create ",
                "configure ",
                "register ",
                "package ",
                "restart ",
                "install ",
                "set up",
            ),
        ),
        (
            "lookup",
            (
                "list ",
                "show ",
                "find ",
                "what is",
                "where is",
                "retrieve ",
                "get ",
                "authenticate ",
            ),
        ),
    )

    for bucket, markers in marker_table:
        for marker in markers:
            if marker in haystack:
                return bucket
    return "mixed"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _field_score(
    query_terms: list[str], value: str | None, weight: float, phrase_bonus: float
) -> float:
    """Score one field of a result against the query terms.

    Args:
        query_terms: Lowercase terms; each is matched as a substring of the
            lowercased field value.
        value: Field text; a falsy value scores 0.0.
        weight: Multiplier applied to the whole field score.
        phrase_bonus: Added when the first two terms, joined by one space,
            occur in the field (only when at least two terms exist).

    Returns:
        0.0 when no term matches, otherwise
        ``weight * (1 + log1p(matching terms) + bonus)``.
    """
    if not value:
        return 0.0

    haystack = value.lower()
    hit_count = 0
    for term in query_terms:
        if term in haystack:
            hit_count += 1
    if not hit_count:
        return 0.0

    bonus = 0.0
    if " ".join(query_terms[:2]) in haystack and len(query_terms) >= 2:
        bonus = phrase_bonus
    return weight * (1.0 + math.log1p(hit_count) + bonus)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def structure_pilot_score(
    query: str,
    result: dict[str, Any],
    rank: int,
    total: int,
    config: StructurePilotConfig = DEFAULT_PILOT_CONFIG,
) -> float:
    """Compute a PageIndex-inspired score from title and breadcrumb structure.

    Args:
        query: Raw query text; it is tokenized with ``tokenize``.
        result: Result mapping; the "title", "breadcrumb", "filename" and
            "text" keys are read (missing keys count as empty).
        rank: 1-based position of the result in the incoming ordering.
        total: Number of results in the incoming ordering.
        config: Weights to apply.

    Returns:
        0.0 when the query has no usable terms; otherwise the sum of the four
        field scores, the weighted rank bonus, and the full-phrase bonuses.
    """
    terms = tokenize(query)
    if not terms:
        return 0.0

    title = result.get("title", "")
    breadcrumb = result.get("breadcrumb", "")
    filename = result.get("filename", "")
    text = result.get("text", "")

    # Linear bonus: 1.0 for rank 1 down to 0.0 for the last result.
    rank_bonus = (total - rank) / (total - 1) if total > 1 else 0.0

    field_specs = (
        (title, config.title_weight, config.title_phrase_bonus),
        (breadcrumb, config.breadcrumb_weight, config.breadcrumb_phrase_bonus),
        (filename, config.filename_weight, 0.0),
        (text, config.text_weight, 0.0),
    )
    # Accumulate in a fixed order so float rounding stays reproducible.
    total_score = 0.0
    for value, weight, bonus in field_specs:
        total_score += _field_score(terms, value, weight, bonus)
    total_score += rank_bonus * config.rank_weight

    # Extra bonus when the entire tokenized query occurs verbatim.
    full_phrase = " ".join(terms)
    phrase_targets = (
        (title, config.title_phrase_bonus),
        (breadcrumb, config.breadcrumb_phrase_bonus),
    )
    for value, bonus in phrase_targets:
        if full_phrase and full_phrase in (value or "").lower():
            total_score += bonus

    return total_score
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def rerank_with_structure(
    query: str,
    results: list[dict[str, Any]],
    config: StructurePilotConfig = DEFAULT_PILOT_CONFIG,
) -> list[dict[str, Any]]:
    """Return results sorted by a structure-aware pilot score.

    Each result is shallow-copied and annotated with "_pilot_bucket",
    "_pilot_config" and "_pilot_score"; the input dicts are not modified.

    Args:
        query: Raw query text.
        results: Candidate results in their incoming rank order.
        config: Weights used for scoring.

    Returns:
        New list of annotated copies, highest pilot score first.
    """
    if not results:
        return []

    total = len(results)
    # The bucket depends only on the query, so classify it once instead of
    # once per result.
    bucket = bucket_query(query)

    scored: list[dict[str, Any]] = []
    for rank, result in enumerate(results, start=1):
        enriched = dict(result)
        enriched["_pilot_bucket"] = bucket
        enriched["_pilot_config"] = config.name
        enriched["_pilot_score"] = structure_pilot_score(
            query, enriched, rank, total, config=config
        )
        scored.append(enriched)

    # NOTE(review): the tie-breaker falls back to "_distance" and is sorted
    # descending; if "_distance" is a smaller-is-better metric this prefers
    # the farther result on ties -- confirm against the retrieval backend.
    scored.sort(
        key=lambda item: (
            item.get("_pilot_score", 0.0),
            item.get("_score_tq", item.get("_distance", item.get("_score", 0.0))),
        ),
        reverse=True,
    )
    return scored
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def get_pilot_configs() -> list[StructurePilotConfig]:
    """Return a new list containing every entry of ``PILOT_CONFIGS``."""
    return [*PILOT_CONFIGS]
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def get_pilot_config_by_name(name: str) -> StructurePilotConfig:
    """Return the pilot config called *name*, or the default if none matches."""
    lookup: dict[str, StructurePilotConfig] = {}
    for cfg in PILOT_CONFIGS:
        lookup[cfg.name] = cfg
    try:
        return lookup[name]
    except KeyError:
        return DEFAULT_PILOT_CONFIG
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def list_routing_policies() -> dict[str, str]:
    """Return a fresh mapping of routing-policy name to its description."""
    policies: dict[str, str] = {}
    policies["baseline-only"] = "Always use baseline retrieval for every query."
    policies["adaptive"] = (
        "Route multi-hop and targeted structural intents to pageindex-pilot; "
        "keep lookup and generic intents on baseline."
    )
    policies["aggressive"] = (
        "Always use pageindex-pilot with breadcrumb-focused config."
    )
    return policies
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def route_query(
    query: str,
    bucket: str,
    routing_policy: str = "baseline-only",
) -> RoutingDecision:
    """Select retrieval engine per-query based on policy and intent markers.

    Args:
        query: Raw query text.
        bucket: Pre-computed bucket label; when falsy, ``bucket_query(query)``
            is used instead.
        routing_policy: "baseline-only", "aggressive" or "adaptive" (case and
            surrounding whitespace are ignored). Any other value falls back
            to the baseline engine.

    Returns:
        The routing decision, including a reason tag for the rule that fired.
    """
    policy = (routing_policy or "baseline-only").strip().lower()
    bucket_label = (bucket or bucket_query(query)).strip().lower()
    lowered_query = (query or "").lower()

    def _baseline(reason: str) -> RoutingDecision:
        return RoutingDecision(engine="baseline", pilot_config=None, reason=reason)

    def _pilot(config_name: str, reason: str) -> RoutingDecision:
        return RoutingDecision(
            engine="pageindex-pilot",
            pilot_config=get_pilot_config_by_name(config_name),
            reason=reason,
        )

    def _mentions(markers: tuple[str, ...]) -> bool:
        for marker in markers:
            if marker in lowered_query:
                return True
        return False

    # Fixed policies are decided without looking at the query itself.
    if policy == "baseline-only":
        return _baseline("policy-baseline-only")
    if policy == "aggressive":
        return _pilot("breadcrumb-focused", "policy-aggressive")
    if policy != "adaptive":
        return _baseline("policy-unknown-fallback")

    multi_hop_markers = (
        " and ",
        " then ",
        " also ",
        "together",
        "combined",
    )
    structural_markers = (
        "how to",
        "add ",
        "create ",
        "where is",
        "which section",
        "which page",
        "how do i",
    )

    # Adaptive: multi-hop needs the bucket OR a marker; structural needs both.
    if bucket_label == "multi-hop" or _mentions(multi_hop_markers):
        return _pilot("breadcrumb-focused", "adaptive-multi-hop")
    if bucket_label == "structural" and _mentions(structural_markers):
        return _pilot("base", "adaptive-structural")
    return _baseline("adaptive-baseline")
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from statistics import mean
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
# Gate thresholds used when no override file is supplied to load_gate_config().
DEFAULT_GATE_CONFIG: dict[str, Any] = {
    # Per-metric limits relative to the baseline run:
    #   max_drop            -- largest allowed (baseline - current)
    #   max_increase_ratio  -- largest allowed (current - baseline) / baseline
    "regression": {
        "hit_rate": {"max_drop": 0.01},
        "mrr": {"max_drop": 0.02},
        "avg_latency_s": {"max_increase_ratio": 0.20},
    },
    # Lowest acceptable value per metric, checked without the baseline values.
    "absolute_minimums": {
        "context_recall": 0.85,
        "faithfulness": 0.90,
    },
    # Metric names that must be present in every current run.
    "required_metrics": [],
}

# Metric keys that aggregate_runs() averages across repeated runs.
_NUMERIC_METRICS = (
    "hit_rate",
    "mrr",
    "avg_latency_s",
    "faithfulness",
    "context_recall",
    "context_precision",
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _identity(run: dict[str, Any]) -> tuple[str, str, str, str]:
    """Return the (suite, profile, engine, routing_policy) key for *run*.

    Missing fields fall back to "control", "unknown", "baseline" and
    "baseline-only" respectively; every component is converted with ``str``.
    """
    fallbacks = (
        ("suite", "control"),
        ("profile", "unknown"),
        ("engine", "baseline"),
        ("routing_policy", "baseline-only"),
    )
    suite, profile, engine, routing = (
        str(run.get(field, fallback)) for field, fallback in fallbacks
    )
    return (suite, profile, engine, routing)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _identity_string(run: dict[str, Any]) -> str:
    """Render the identity of *run* as ``"suite=..., profile=..., ..."``."""
    labels = ("suite", "profile", "engine", "routing_policy")
    pairs = zip(labels, _identity(run))
    return ", ".join(f"{label}={value}" for label, value in pairs)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def aggregate_runs(runs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Aggregate repeated runs by identity, averaging numeric quality metrics.

    Runs sharing the same ``_identity`` key are merged into a single row that
    carries the identity fields, ``n_runs``, and the mean of each metric in
    ``_NUMERIC_METRICS`` for which at least one run has an int/float value.
    Rows are returned sorted by identity.
    """
    buckets: dict[tuple[str, str, str, str], list[dict[str, Any]]] = {}
    for run in runs:
        ident = _identity(run)
        if ident not in buckets:
            buckets[ident] = []
        buckets[ident].append(run)

    rows: list[dict[str, Any]] = []
    for (suite, profile, engine, routing), members in buckets.items():
        row: dict[str, Any] = {
            "suite": suite,
            "profile": profile,
            "engine": engine,
            "routing_policy": routing,
            "n_runs": len(members),
        }
        for metric in _NUMERIC_METRICS:
            samples = []
            for member in members:
                # Non-numeric or absent values are left out of the average.
                if isinstance(member.get(metric), (int, float)):
                    samples.append(member[metric])
            if samples:
                row[metric] = float(mean(samples))
        rows.append(row)

    return sorted(rows, key=_identity)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Return a new dict with *override* recursively merged onto *base*.

    Where both sides hold a dict under the same key the two are merged
    recursively; otherwise the override value replaces the base value. Key
    order follows *base*, with keys new in *override* appended.

    The result shares no mutable containers with either input, so mutating
    it cannot alter *base* or *override*.

    Args:
        base: Mapping providing the default values.
        override: Mapping whose values take precedence.

    Returns:
        The merged, fully independent dict.
    """
    import copy  # local import keeps this block self-contained

    merged: dict[str, Any] = {}
    for key, base_value in base.items():
        if key not in override:
            # Untouched entries are copied, not aliased to the base mapping.
            merged[key] = copy.deepcopy(base_value)
        elif isinstance(base_value, dict) and isinstance(override[key], dict):
            merged[key] = _deep_merge(base_value, override[key])
        else:
            merged[key] = copy.deepcopy(override[key])

    for key, value in override.items():
        if key not in base:
            merged[key] = copy.deepcopy(value)
    return merged
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def load_gate_config(path: str | None) -> dict[str, Any]:
    """Load the gate configuration, optionally merged with a JSON override.

    Args:
        path: Path to a UTF-8 JSON file holding an object of overrides, or a
            falsy value to use the defaults alone.

    Returns:
        A config dict the caller may mutate freely: it is built from a deep
        copy of ``DEFAULT_GATE_CONFIG``, so the module default is never
        shared with the caller.

    Raises:
        ValueError: If the file's top-level JSON value is not an object.
    """
    import copy  # local import keeps this block self-contained

    # A shallow dict() copy would still share the nested "regression" and
    # "absolute_minimums" dicts and the "required_metrics" list.
    config = copy.deepcopy(DEFAULT_GATE_CONFIG)
    if not path:
        return config

    override = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(override, dict):
        raise ValueError("Gate config must be a JSON object.")
    return _deep_merge(config, override)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def build_baseline_payload(runs: list[dict[str, Any]]) -> dict[str, Any]:
    """Build the baseline document: version, UTC timestamp, aggregated runs."""
    payload: dict[str, Any] = {"version": 1}
    payload["created_at"] = datetime.now(timezone.utc).isoformat()
    payload["runs"] = aggregate_runs(runs)
    return payload
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def write_baseline(path: str, runs: list[dict[str, Any]]) -> dict[str, Any]:
    """Write the baseline payload for *runs* to *path* as indented JSON.

    Parent directories are created when missing. Returns the payload that
    was written.
    """
    payload = build_baseline_payload(runs)
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
    return payload
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def load_baseline(path: str) -> list[dict[str, Any]]:
    """Read a baseline file and return its runs, aggregated by identity.

    The file may hold either a JSON list of runs or a JSON object whose
    "runs" entry is such a list.

    Raises:
        ValueError: If the file has neither of those shapes.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        runs = payload.get("runs")
    else:
        runs = payload
    if not isinstance(runs, list):
        raise ValueError("Baseline file must be a JSON list or object with 'runs'.")
    return aggregate_runs(runs)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _check_required_metrics(run, run_name, metrics, failures):
    """Append one failure message per required metric absent from *run*."""
    absent = [metric for metric in metrics if metric not in run]
    failures.extend(
        f"Missing required metric '{metric}' in current run: {run_name}"
        for metric in absent
    )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _check_regression_gates(run, baseline, run_name, config, failures, warnings):
    """Compare *run* with *baseline* and record regression-gate violations.

    For each metric in *config*: a metric missing on either side only adds a
    warning. Otherwise "max_drop" bounds ``baseline - current`` and
    "max_increase_ratio" bounds ``(current - baseline) / baseline``; the
    ratio check is skipped unless the baseline value is positive. Messages
    are appended to *failures* and *warnings* in place.
    """
    for metric, limits in config.items():
        if metric not in run:
            missing_side = "current"
        elif metric not in baseline:
            missing_side = "baseline"
        else:
            missing_side = None
        if missing_side is not None:
            warnings.append(
                f"Metric '{metric}' missing in {missing_side} run: {run_name}"
            )
            continue

        current_value = float(run[metric])
        baseline_value = float(baseline[metric])

        if "max_drop" in limits:
            observed_drop = baseline_value - current_value
            allowed_drop = float(limits["max_drop"])
            if observed_drop > allowed_drop:
                failures.append(
                    f"Regression gate failed for {metric} ({run_name}): "
                    f"drop={observed_drop:.4f}, allowed={allowed_drop:.4f}"
                )

        if "max_increase_ratio" in limits and baseline_value > 0:
            observed_ratio = (current_value - baseline_value) / baseline_value
            allowed_ratio = float(limits["max_increase_ratio"])
            if observed_ratio > allowed_ratio:
                failures.append(
                    f"Regression gate failed for {metric} ({run_name}): "
                    f"increase_ratio={observed_ratio:.4f}, "
                    f"allowed={allowed_ratio:.4f}"
                )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _check_absolute_minimums(run, run_name, config, failures, warnings):
    """Record a failure for each metric in *run* below its configured floor.

    Metrics listed in *config* but absent from *run* only add a warning.
    Messages are appended to *failures* and *warnings* in place.
    """
    for metric, threshold in config.items():
        if metric in run:
            if float(run[metric]) < float(threshold):
                observed = float(run[metric])
                floor = float(threshold)
                failures.append(
                    f"Absolute gate failed for {metric} ({run_name}): "
                    f"value={observed:.4f}, threshold={floor:.4f}"
                )
        else:
            warnings.append(
                f"Absolute gate metric '{metric}' missing in current run: {run_name}"
            )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _check_metrics(
    run: dict[str, Any],
    baseline: dict[str, Any],
    run_name: str,
    gate_config: dict[str, Any],
    failures: list[str],
    warnings: list[str],
):
    """Run every gate family for one run against its baseline.

    Applies, in order, the required-metric check, the regression gates and
    the absolute minimums, reading each section from *gate_config* (a missing
    section is treated as empty). Messages are appended to *failures* and
    *warnings* in place.
    """
    required = gate_config.get("required_metrics", [])
    _check_required_metrics(run, run_name, required, failures)

    regression_limits = gate_config.get("regression", {})
    _check_regression_gates(
        run, baseline, run_name, regression_limits, failures, warnings
    )

    minimums = gate_config.get("absolute_minimums", {})
    _check_absolute_minimums(run, run_name, minimums, failures, warnings)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def evaluate_quality_gates(
    current_runs: list[dict[str, Any]],
    baseline_runs: list[dict[str, Any]],
    gate_config: dict[str, Any],
) -> dict[str, Any]:
    """Evaluate all quality gates for the current runs against a baseline.

    Both run lists are aggregated by identity first. A current run with a
    matching baseline gets every gate family. A current run without one gets
    a warning and is still checked against the baseline-independent gates
    (required metrics and absolute minimums); only the regression comparison
    is skipped for it.

    Args:
        current_runs: Raw runs from the current benchmark execution.
        baseline_runs: Raw or aggregated runs forming the baseline.
        gate_config: Gate thresholds with optional "required_metrics",
            "regression" and "absolute_minimums" sections.

    Returns:
        Dict with "passed" (True when there are no failures), "failures",
        "warnings", "current_count" and "baseline_count".
    """
    current = aggregate_runs(current_runs)
    baseline_map = {_identity(run): run for run in aggregate_runs(baseline_runs)}

    failures: list[str] = []
    warnings: list[str] = []

    for run in current:
        run_name = _identity_string(run)
        baseline = baseline_map.get(_identity(run))

        if baseline is None:
            warnings.append(f"No baseline run matched: {run_name}")
            # These gates do not need baseline values, so a missing baseline
            # must not let a run bypass them.
            _check_required_metrics(
                run, run_name, gate_config.get("required_metrics", []), failures
            )
            _check_absolute_minimums(
                run,
                run_name,
                gate_config.get("absolute_minimums", {}),
                failures,
                warnings,
            )
            continue

        _check_metrics(run, baseline, run_name, gate_config, failures, warnings)

    return {
        "passed": len(failures) == 0,
        "failures": failures,
        "warnings": warnings,
        "current_count": len(current),
        "baseline_count": len(baseline_map),
    }
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def format_gate_report(report: dict[str, Any]) -> str:
    """Render a gate report dict as a multi-line, human-readable string.

    The output has a title, run counts, an optional "Warnings:" section, and
    then either a "Failures:" section or a line saying all gates passed.
    Missing keys default to zero counts and empty lists.
    """
    warnings = report.get("warnings", [])
    failures = report.get("failures", [])

    out = ["\nQUALITY GATE REPORT", "-" * 60]
    out.append(f"Current runs : {report.get('current_count', 0)}")
    out.append(f"Baseline runs: {report.get('baseline_count', 0)}")

    if warnings:
        out.append("Warnings:")
        for warning in warnings:
            out.append(f" - {warning}")

    if not failures:
        out.append("All configured quality gates passed.")
    else:
        out.append("Failures:")
        for failure in failures:
            out.append(f" - {failure}")

    return "\n".join(out)
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Terminal reporting helpers for benchmark results.
|
|
4
|
+
|
|
5
|
+
Pure presentation logic — no internal package dependencies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def print_result(result: dict[str, Any], final_k: int) -> None:
    """Print a single profile/engine benchmark result.

    Args:
        result: Benchmark result. "hit_rate", "mrr", "avg_latency_s",
            "model_rss_mb", "routing_policy" and "per_query" are required.
            The RAGAS metrics and "bucket_metrics" are optional.
        final_k: Cut-off shown in the HR@k / MRR@k labels.

    The RAGAS section is shown only when "faithfulness" is present. Within
    it, each metric is printed only if it exists in *result*, so a partial
    RAGAS result no longer raises ``KeyError``.
    """
    print(f" Hit Rate (HR@{final_k}) : {result['hit_rate']:.1%}")
    print(f" MRR@{final_k} : {result['mrr']:.3f}")
    if "faithfulness" in result:
        print(f" Faithfulness : {result['faithfulness']:.3f}")
        # Recall and precision may be absent even when faithfulness is present.
        if "context_recall" in result:
            print(f" Context Recall : {result['context_recall']:.3f}")
        if "context_precision" in result:
            print(f" Context Precision : {result['context_precision']:.3f}")
    print(f" Avg latency : {result['avg_latency_s']:.3f}s")
    print(f" Model RSS delta : {result['model_rss_mb']:.0f} MB")
    print(f" Routing policy : {result['routing_policy']}")
    if result.get("bucket_metrics"):
        for bucket_name, metrics in result["bucket_metrics"].items():
            print(
                f" {bucket_name.title()} MRR : {metrics.get('mrr', 0.0):.3f} "
                f"(n={metrics.get('n', 0)})"
            )

    print("\n Per-query results:")
    for pq in result["per_query"]:
        status = "HIT " if pq["hit"] else "MISS"
        print(
            f" {status} [{pq['latency_s']:.2f}s] [{pq['bucket']}] "
            f"[{pq['selected_engine']}] {pq['query'][:70]}"
        )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def print_summary_table(all_results: list[dict[str, Any]], final_k: int = 5) -> None:
    """Print a side-by-side comparison table of all benchmark results.

    Nothing is printed when fewer than two results are given. RAGAS columns
    replace the RSS column when any result contains "faithfulness"; RAGAS
    values missing from an individual row are shown as 0.000.

    Args:
        all_results: One result dict per profile/engine combination.
        final_k: Cut-off shown in the HR@k / MRR@k column headers. Defaults
            to 5, which reproduces the previously hard-coded labels.
    """
    if len(all_results) <= 1:
        return

    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print("=" * 60)

    hr_label = f"HR@{final_k}"
    mrr_label = f"MRR@{final_k}"
    has_ragas = any("faithfulness" in r for r in all_results)
    if has_ragas:
        header = (
            f"{'Profile':<10} {'Engine':<15} {hr_label:>8} {mrr_label:>8} "
            f"{'Faith':>8} {'Recall':>8} {'Prec':>8} {'Latency':>10}"
        )
    else:
        header = (
            f"{'Profile':<10} {'Engine':<15} {hr_label:>8} {mrr_label:>8} "
            f"{'Latency':>10} {'RSS MB':>10}"
        )
    print(header)
    print("-" * len(header))

    for result in all_results:
        if has_ragas:
            print(
                f"{result['profile']:<10} {result['engine']:<15} "
                f"{result['hit_rate']:>7.1%} "
                f"{result['mrr']:>8.3f} "
                f"{result.get('faithfulness', 0.0):>8.3f} "
                f"{result.get('context_recall', 0.0):>8.3f} "
                f"{result.get('context_precision', 0.0):>8.3f} "
                f"{result['avg_latency_s']:>9.3f}s"
            )
        else:
            print(
                f"{result['profile']:<10} {result['engine']:<15} "
                f"{result['hit_rate']:>7.1%} "
                f"{result['mrr']:>8.3f} "
                f"{result['avg_latency_s']:>9.3f}s "
                f"{result['model_rss_mb']:>9.0f}"
            )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def print_autoresearch_summary(all_results: list[dict[str, Any]]) -> None:
    """Print the autoresearch summary, highlighting the best structural config.

    Only results whose engine is "pageindex-pilot" are considered; nothing is
    printed when there are none. The best run is the one with the highest
    structural-bucket MRR (the first such run on ties). A missing or falsy
    "pilot_config" is reported as "base".
    """
    if not all_results:
        return

    def _structural_mrr(run: dict[str, Any]) -> float:
        return run.get("bucket_metrics", {}).get("structural", {}).get("mrr", 0.0)

    best_run = None
    best_mrr = None
    for run in all_results:
        if run.get("engine") != "pageindex-pilot":
            continue
        candidate_mrr = _structural_mrr(run)
        # Strict ">" keeps the earliest run when several share the top MRR.
        if best_run is None or candidate_mrr > best_mrr:
            best_run = run
            best_mrr = candidate_mrr
    if best_run is None:
        return

    config_label = best_run.get("pilot_config") or "base"
    print("\nAUTORESEARCH SUMMARY")
    print("-" * 60)
    print(
        f"Best structural config: {config_label} "
        f"(MRR={best_mrr:.3f})"
    )
    print(
        "Stop condition: if the structural MRR no longer improves across the "
        "pilot configs, "
        "the structure-aware ceiling has been reached."
    )
|