inferencebench-mt 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ """InferenceBench LLM machine-translation plugin."""
2
+
3
+ from inferencebench_mt.plugin import EXPECTED_METRICS, LLMMTPlugin
4
+ from inferencebench_mt.schemas import BenchmarkSpec, EngineKind, RunContext
5
+
6
+ __all__ = [
7
+ "EXPECTED_METRICS",
8
+ "BenchmarkSpec",
9
+ "EngineKind",
10
+ "LLMMTPlugin",
11
+ "RunContext",
12
+ ]
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-de
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to German, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-de
8
+ path: flores-mini-en-de.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: de
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-es
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to Spanish, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-es
8
+ path: flores-mini-en-es.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: es
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-fr
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to French, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-fr
8
+ path: flores-mini-en-fr.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: fr
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-ja
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to Japanese, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-ja
8
+ path: flores-mini-en-ja.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: ja
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "Hallo, wie geht es dir?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "Guten Morgen, mein Freund.", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "Der Präsident unterzeichnete gestern das neue Klimaabkommen.", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Die Aktienmärkte fielen nach der Ankündigung der Zentralbank stark.", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "Die Transformer-Architektur verwendet Self-Attention-Schichten.", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "Bitte starten Sie den Server neu, nachdem das Update abgeschlossen ist.", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "Ich hätte gerne einen Kaffee mit Milch und Zucker, bitte.", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "Wo ist der nächste Bahnhof?", "domain": "conversational"}
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "Hola, ¿cómo estás?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "Buenos días, mi amigo.", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "El presidente firmó ayer el nuevo acuerdo climático.", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Los mercados bursátiles cayeron bruscamente tras el anuncio del banco central.", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "La arquitectura transformer utiliza capas de autoatención.", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "Por favor, reinicia el servidor cuando termine la actualización.", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "Quisiera un café con leche y azúcar, por favor.", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "¿Dónde está la estación de tren más cercana?", "domain": "conversational"}
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "Bonjour, comment allez-vous ?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "Bonjour, mon ami.", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "Le président a signé le nouvel accord sur le climat hier.", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Les marchés boursiers ont fortement chuté après l'annonce de la banque centrale.", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "L'architecture transformeur utilise des couches d'auto-attention.", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "Veuillez redémarrer le serveur une fois la mise à jour terminée.", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "Je voudrais un café avec du lait et du sucre, s'il vous plaît.", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "Où se trouve la gare la plus proche ?", "domain": "conversational"}
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "こんにちは、お元気ですか?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "おはよう、友よ。", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "大統領は昨日、新しい気候協定に署名しました。", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "中央銀行の発表を受けて、株式市場は急落しました。", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "トランスフォーマーアーキテクチャは自己注意層を使用します。", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "アップデートが完了したらサーバーを再起動してください。", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "ミルクと砂糖入りのコーヒーをお願いします。", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "最寄りの駅はどこですか?", "domain": "conversational"}
@@ -0,0 +1,468 @@
1
+ """LLMMTPlugin — entry point for ``llm.mt`` benchmarks.
2
+
3
+ The fifth-modality sibling to the perf / quality / voice / embeddings plugins.
4
+ Implements the same contract (``list_benchmarks`` / ``get_benchmark`` /
5
+ ``validate`` / ``run``) but its headline metric is translation accuracy —
6
+ chrF (character n-gram F-score) by default, with token-BLEU and exact-match
7
+ as alternates. Scoring is deterministic and dependency-free; see
8
+ :mod:`inferencebench_mt.scoring`.
9
+
10
+ The plugin drives a real :class:`ModelClient` per fixture row, constructs an
11
+ MT prompt (``"Translate from {src} to {tgt}: ..."``), and scores the model's
12
+ hypothesis against the bundled reference. Self-hosted OpenAI-compatible
13
+ endpoints (vLLM, SGLang) get the LiteLLM ``openai/`` routing prefix added
14
+ exactly once — same convention as the llm-quality plugin.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import hashlib
20
+ import json
21
+ import math
22
+ import os
23
+ import time
24
+ from pathlib import Path
25
+ from typing import TYPE_CHECKING
26
+
27
+ import yaml
28
+
29
+ from inferencebench.envelope import (
30
+ DatasetSpec as EnvDatasetSpec,
31
+ )
32
+ from inferencebench.envelope import (
33
+ EngineConfig,
34
+ Envelope,
35
+ EnvelopeBuilder,
36
+ ModelConfig,
37
+ Quantization,
38
+ SigningMode,
39
+ sign_envelope,
40
+ )
41
+ from inferencebench.harness import (
42
+ CompletionResult,
43
+ ModelClient,
44
+ Sample,
45
+ collect_hardware_fingerprint,
46
+ collect_software_provenance,
47
+ )
48
+ from inferencebench.harness.metrics import EnergyReport, Percentiles, TelemetryWindow
49
+ from inferencebench_mt.schemas import BenchmarkSpec, EngineKind, RunContext
50
+ from inferencebench_mt.scoring import SCORERS
51
+
52
+ if TYPE_CHECKING:
53
+ from collections.abc import Callable
54
+
55
+
56
+ def _json_num(v: float) -> str:
57
+ """JSON-safe numeric encoder: NaN/inf become null."""
58
+ if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
59
+ return "null"
60
+ return repr(v)
61
+
62
+
63
+ def _json_str(v: str | None) -> str:
64
+ """JSON-safe string encoder."""
65
+ return json.dumps(v if v is not None else "")
66
+
67
+
68
+ # Engines that require ``base_url`` (self-hosted OpenAI-compatible servers).
69
+ # OPENAI / COHERE here mean "provider-hosted endpoint" — base_url is optional.
70
+ _SELF_HOSTED_ENGINES = frozenset({EngineKind.VLLM, EngineKind.SGLANG})
71
+
72
+
73
+ def _fixtures_cache_root() -> Path:
74
+ """Resolve the bench-fixtures cache root for ``fixtures://`` dataset URIs."""
75
+ override = os.environ.get("BENCH_FIXTURES_ROOT")
76
+ if override:
77
+ return Path(override)
78
+ return Path.home() / ".cache" / "inferencebench" / "fixtures"
79
+
80
+
81
+ def _compute_fixture_hash(items: list[dict[str, str]]) -> str:
82
+ """SHA-256 over the canonical-JSON-encoded fixture rows."""
83
+ canonical = json.dumps(items, sort_keys=True, separators=(",", ":"))
84
+ return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
85
+
86
+
87
def _build_client(context: RunContext, *, timeout_s: float = 60.0) -> ModelClient:
    """Create the :class:`ModelClient` described by ``context``.

    For self-hosted engines (members of ``_SELF_HOSTED_ENGINES``) the model id
    is routed through LiteLLM's ``openai/<model>`` prefix: one leading
    ``openai/`` supplied by the user is removed before the prefix is added, so
    the prefix is applied exactly once. Those engines also fall back to the
    placeholder API key ``"EMPTY"`` when none is configured.

    For every other engine the model id is passed through unchanged and an
    empty API key becomes ``None``. An empty ``base_url`` is always passed as
    ``None``.
    """
    if context.engine_kind in _SELF_HOSTED_ENGINES:
        bare_id = context.model_id.removeprefix("openai/")
        routed_model = f"openai/{bare_id}"
        key: str | None = context.api_key or "EMPTY"
    else:
        routed_model = context.model_id
        key = context.api_key or None

    endpoint = context.base_url or None
    return ModelClient(
        model=routed_model,
        api_key=key,
        base_url=endpoint,
        timeout_s=timeout_s,
    )
111
+
112
+
113
+ def _build_prompt(source: str, source_lang: str, target_lang: str) -> str:
114
+ """Construct the translation prompt for one fixture row."""
115
+ return f"Translate from {source_lang} to {target_lang}:\n\n{source}\n\nTranslation:"
116
+
117
+
118
+ # Metrics this plugin is expected to emit. Consumed by ``bench coverage``.
119
+ EXPECTED_METRICS: tuple[str, ...] = (
120
+ "chrf_mean",
121
+ "chrf_p50",
122
+ "chrf_p95",
123
+ "ok_rate",
124
+ "n_samples",
125
+ "ttft_p50_ms",
126
+ "total_p50_ms",
127
+ )
128
+
129
+
130
class LLMMTPlugin:
    """Plugin entry point. Registered via ``inferencebench.plugins`` entrypoint group.

    Public contract: ``list_benchmarks`` / ``get_benchmark`` / ``validate`` /
    ``run``. Everything prefixed with an underscore is an internal helper.
    """

    suite_id = "llm.mt"
    version = "0.0.2"
    description = (
        "LLM machine-translation benchmarks (chrF / token-BLEU / exact-match on bundled fixtures)."
    )

    # ----------------------------------------------------------- benchmarks #
    def list_benchmarks(self) -> list[BenchmarkSpec]:
        """Return every bundled benchmark spec, ordered by YAML file name.

        Returns an empty list when the ``benchmarks`` directory is missing.
        """
        bench_dir = self._benchmarks_dir()
        if not bench_dir.exists():
            return []
        return [self._load_yaml(yml) for yml in sorted(bench_dir.glob("*.yaml"))]

    def get_benchmark(self, benchmark_id: str) -> BenchmarkSpec:
        """Return the bundled spec whose ``benchmark_id`` matches.

        Raises:
            KeyError: no bundled spec has that id.
        """
        for spec in self.list_benchmarks():
            if spec.benchmark_id == benchmark_id:
                return spec
        msg = f"benchmark_id not found: {benchmark_id}"
        raise KeyError(msg)

    # ------------------------------------------------------------- validate #
    def validate(self, spec: BenchmarkSpec, context: RunContext) -> list[str]:
        """Return human-readable warnings about ``spec`` / ``context``.

        Checks for an empty model id, a self-hosted engine without a
        ``base_url``, and a missing fixture file. An empty list means no
        problem was detected; nothing is raised.
        """
        warnings: list[str] = []
        if not context.model_id:
            warnings.append("model_id is empty")
        if context.engine_kind in _SELF_HOSTED_ENGINES and not context.base_url:
            warnings.append(
                f"{context.engine_kind.value} needs base_url (e.g. http://localhost:8000/v1)"
            )
        if not self._dataset_path(spec).exists():
            warnings.append(f"fixture not found: {spec.dataset.path}")
        return warnings

    # ------------------------------------------------------------------ run #
    def run(self, spec: BenchmarkSpec, context: RunContext) -> Envelope:
        """Execute the benchmark and return a SIGNED envelope.

        Signing mode comes from ``context.extra["signing_mode"]`` (default
        ``"dev"``). Dev signing needs ``context.extra["dev_key_path"]``; any
        other mode value is signed keyless.

        Raises:
            ValueError: dev signing requested without ``dev_key_path``, or the
                fixture contains no usable rows.
            FileNotFoundError: the fixture file does not exist.
        """
        client = _build_client(context)
        items = self._load_fixture(spec)
        fixture_hash = _compute_fixture_hash(items)
        scorer = SCORERS[spec.scoring]

        samples, scores, telemetry = self._score_items(client, items, spec, scorer)

        # Best-effort diagnostic dump — never blocks the run on I/O errors.
        self._dump_samples(context, samples)

        envelope = self._build_envelope(
            spec,
            context,
            samples=samples,
            scores=scores,
            dataset_hash=fixture_hash,
            energy=telemetry.summarise(samples),
        )
        signing_mode = context.extra.get("signing_mode", "dev")
        dev_key_path = context.extra.get("dev_key_path")
        if signing_mode == "dev":
            if not dev_key_path:
                msg = "dev signing requires context.extra['dev_key_path']"
                raise ValueError(msg)
            return sign_envelope(
                envelope,
                mode=SigningMode.DEV,
                dev_key_path=Path(str(dev_key_path)),
            )
        return sign_envelope(envelope, mode=SigningMode.KEYLESS)

    # -------------------------------------------------------- core scoring #
    def _score_items(
        self,
        client: ModelClient,
        items: list[dict[str, str]],
        spec: BenchmarkSpec,
        scorer: Callable[[str, str], float],
    ) -> tuple[list[Sample], list[float], TelemetryWindow]:
        """Iterate fixture items, call the model, score each hypothesis.

        Each row is sent via ``client.complete(prompt, stream=True,
        max_tokens=256)``. A request that raises is recorded as a ``Sample``
        with ``ok=False``, NaN latencies and the exception text; it adds no
        entry to ``scores``. Successful rows add one score and one ``Sample``
        carrying the row's domain and score in ``extra``.

        Returns:
            ``(samples, scores, telemetry)`` where ``telemetry`` is the
            ``TelemetryWindow`` that was active around the whole loop.
        """
        samples: list[Sample] = []
        scores: list[float] = []
        telemetry = TelemetryWindow()
        with telemetry:
            for idx, item in enumerate(items):
                source = item["source"]
                reference = item["reference"]
                prompt = _build_prompt(source, spec.source_lang, spec.target_lang)
                t_arrival = time.perf_counter() * 1000.0
                try:
                    result: CompletionResult = client.complete(
                        prompt, stream=True, max_tokens=256
                    )
                except Exception as exc:
                    # Deliberately broad: one failed request must not abort
                    # the remaining rows; the failure is kept as a sample.
                    samples.append(
                        Sample(
                            request_idx=idx,
                            arrival_ms=t_arrival,
                            start_ms=t_arrival,
                            ttft_ms=float("nan"),
                            total_ms=float("nan"),
                            tpot_ms=float("nan"),
                            tokens_in=0,
                            tokens_out=0,
                            cost_usd=0.0,
                            finish_reason="error",
                            ok=False,
                            error=str(exc),
                        )
                    )
                    continue

                score = float(scorer(reference, result.text))
                scores.append(score)

                sample_extra: dict[str, str | int | float | bool] = {
                    "domain": item.get("domain", ""),
                    "score": score,
                }
                samples.append(
                    Sample(
                        request_idx=idx,
                        arrival_ms=t_arrival,
                        start_ms=t_arrival,
                        ttft_ms=result.ttft_ms,
                        total_ms=result.total_ms,
                        tpot_ms=result.tpot_ms,
                        tokens_in=result.tokens_in,
                        tokens_out=result.tokens_out,
                        cost_usd=result.cost_usd,
                        finish_reason=result.finish_reason,
                        ok=True,
                        extra=sample_extra,
                    )
                )
        return samples, scores, telemetry

    # ------------------------------------------------------------ samples #
    @staticmethod
    def _sample_json_line(sample: Sample) -> str:
        """Serialise one sample as a compact JSON line (newline-terminated).

        Key order is ``request_idx, ok, ttft_ms, total_ms, tokens_in,
        tokens_out, [score], finish_reason, [error]``. Non-finite floats are
        written as ``null``. Every value goes through ``json.dumps`` so
        strings such as ``finish_reason`` are always correctly escaped.
        """

        def finite_or_none(value: float) -> float | None:
            if isinstance(value, float) and not math.isfinite(value):
                return None
            return value

        record: dict[str, object] = {
            "request_idx": sample.request_idx,
            "ok": bool(sample.ok),
            "ttft_ms": finite_or_none(sample.ttft_ms),
            "total_ms": finite_or_none(sample.total_ms),
            "tokens_in": sample.tokens_in,
            "tokens_out": sample.tokens_out,
        }
        score = sample.extra.get("score") if sample.extra else None
        if isinstance(score, (int, float)):
            record["score"] = finite_or_none(float(score))
        record["finish_reason"] = sample.finish_reason or ""
        if sample.error:
            record["error"] = sample.error
        return json.dumps(record, separators=(",", ":")) + "\n"

    def _dump_samples(self, context: RunContext, samples: list[Sample]) -> None:
        """Write per-request samples (incl. score) to ``<output_dir>/samples-<ts>.jsonl``.

        ``<ts>`` is the current Unix time in whole seconds. This is a
        diagnostics-only dump: ``OSError`` raised while creating the directory
        or writing the file is swallowed so it never blocks the run.
        """
        try:
            out_dir = Path(context.output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
            path = out_dir / f"samples-{int(time.time())}.jsonl"
            with path.open("w", encoding="utf-8") as fp:
                fp.writelines(self._sample_json_line(s) for s in samples)
        except OSError:
            pass  # diagnostics-only — never block the run

    # ---------------------------------------------------------- file paths #
    def _benchmarks_dir(self) -> Path:
        """Directory holding the bundled benchmark YAML specs."""
        return Path(__file__).parent / "benchmarks"

    def _datasets_dir(self) -> Path:
        """Directory holding the bundled JSONL fixtures."""
        return Path(__file__).parent / "datasets"

    def _dataset_path(self, spec: BenchmarkSpec) -> Path:
        """Resolve the fixture file for ``spec``.

        ``fixtures://<key>`` maps to ``<cache root>/<key>.jsonl``; anything
        else is joined onto the bundled ``datasets`` directory.
        """
        raw = spec.dataset.path
        if raw.startswith("fixtures://"):
            return _fixtures_cache_root() / f"{raw[len('fixtures://') :]}.jsonl"
        return self._datasets_dir() / raw

    def _load_yaml(self, path: Path) -> BenchmarkSpec:
        """Parse one benchmark YAML file into a validated ``BenchmarkSpec``."""
        raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
        return BenchmarkSpec.model_validate(raw)

    def _load_fixture(self, spec: BenchmarkSpec) -> list[dict[str, str]]:
        """Load the fixture rows for ``spec``.

        Blank lines, non-object rows and rows lacking ``source`` or
        ``reference`` are skipped. Kept rows are normalised to string-valued
        ``source`` / ``reference`` / ``domain`` dicts (``domain`` defaults to
        the empty string).

        Raises:
            FileNotFoundError: the fixture file does not exist.
            json.JSONDecodeError: a non-blank line is not valid JSON; the
                message names the fixture file and the 1-based line number.
            ValueError: no usable row was found.
        """
        path = self._dataset_path(spec)
        if not path.exists():
            if spec.dataset.path.startswith("fixtures://"):
                key = spec.dataset.path[len("fixtures://") :]
                msg = f"fixture not cached: {path}. Run `bench fixtures fetch {key}` first."
                raise FileNotFoundError(msg)
            msg = f"fixture not found: {path}"
            raise FileNotFoundError(msg)
        items: list[dict[str, str]] = []
        with path.open("r", encoding="utf-8") as fp:
            for line_no, raw_line in enumerate(fp, start=1):
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as exc:
                    # Same exception type, but tell the user which row broke.
                    msg = f"{exc.msg} (fixture {path}, line {line_no})"
                    raise json.JSONDecodeError(msg, exc.doc, exc.pos) from exc
                if not isinstance(obj, dict):
                    continue
                if "source" not in obj or "reference" not in obj:
                    continue
                items.append(
                    {
                        "source": str(obj["source"]),
                        "reference": str(obj["reference"]),
                        "domain": str(obj.get("domain", "")),
                    }
                )
        if not items:
            msg = f"fixture is empty: {path}"
            raise ValueError(msg)
        return items

    # ---------------------------------------------------------- envelope #
    def _build_envelope(
        self,
        spec: BenchmarkSpec,
        context: RunContext,
        *,
        samples: list[Sample],
        scores: list[float],
        dataset_hash: str,
        energy: EnergyReport | None = None,
    ) -> Envelope:
        """Aggregate samples and scores into an unsigned ``Envelope``.

        Always emits ``n_samples`` / ``n_ok`` / ``ok_rate``. Score, latency,
        token, energy and cost metrics are emitted only when the underlying
        data exists (e.g. no score metrics when every request failed).
        """
        hw = collect_hardware_fingerprint()
        sw = collect_software_provenance()

        metrics: dict[str, float | int | str | None] = {}

        ok_samples = [s for s in samples if s.ok]
        n_ok = len(ok_samples)
        metrics["n_samples"] = float(len(samples))
        metrics["n_ok"] = float(n_ok)
        metrics["ok_rate"] = float(n_ok) / float(len(samples)) if samples else 0.0

        # Headline scoring metric — keyed by the spec's scoring strategy so
        # downstream ``bench diff`` knows the direction (all three MT scorers
        # are higher-is-better).
        if scores:
            mean_score = sum(scores) / len(scores)
            if spec.scoring == "exact_match":
                metrics["exact_match_rate"] = mean_score
            else:
                prefix = spec.scoring  # "chrf" | "bleu_token"
                # Keep the metric key short for BLEU (``bleu_mean``, not
                # ``bleu_token_mean``) so diff's _HIGHER_IS_BETTER policy and
                # the leaderboard column headers stay readable.
                key_prefix = "bleu" if prefix == "bleu_token" else prefix
                metrics[f"{key_prefix}_mean"] = mean_score
                if len(scores) >= 2:
                    pcts = Percentiles(scores, percentiles=(50.0, 95.0))
                    metrics[f"{key_prefix}_p50"] = pcts.p50
                    metrics[f"{key_prefix}_p95"] = pcts.p95
                else:
                    # A single score is its own median and p95.
                    metrics[f"{key_prefix}_p50"] = mean_score
                    metrics[f"{key_prefix}_p95"] = mean_score

        # Latency aggregates — "quality at what cost" comparisons.
        ttft_vals = [s.ttft_ms for s in ok_samples if math.isfinite(s.ttft_ms)]
        total_vals = [s.total_ms for s in ok_samples if math.isfinite(s.total_ms)]
        if ttft_vals:
            metrics["ttft_p50_ms"] = Percentiles(ttft_vals).p50
        if total_vals:
            metrics["total_p50_ms"] = Percentiles(total_vals).p50

        tokens_out_total = sum(s.tokens_out for s in ok_samples)
        if tokens_out_total:
            metrics["tokens_out_total"] = float(tokens_out_total)

        # Energy / power summary from telemetry (None on plugins that haven't
        # threaded a TelemetryWindow through yet). Mirrors llm-inference.
        if energy is not None:
            if energy.gpu_power_avg_w > 0:
                metrics["power_avg_w"] = energy.gpu_power_avg_w
                metrics["power_peak_w"] = energy.gpu_power_peak_w
            if energy.total_energy_joules > 0:
                metrics["energy_joules_total"] = energy.total_energy_joules
            # Skip NaN and +/- inf alike: neither is representable in JSON.
            if math.isfinite(energy.joules_per_token):
                metrics["joules_per_token"] = energy.joules_per_token

        # Cost: only emit when the provider actually reported it. Self-hosted
        # vLLM / SGLang never do; the perf plugin's pricing-registry fallback
        # is intentionally NOT mirrored here — MT runs are cheap enough
        # that a missing-cost row is more honest than an estimated one.
        cost_total = sum(s.cost_usd for s in ok_samples)
        if tokens_out_total and cost_total > 0:
            metrics["cost_usd_per_million_tokens"] = (cost_total / tokens_out_total) * 1e6
            metrics["cost_source"] = "provider"

        builder = EnvelopeBuilder(
            suite_id=spec.benchmark_id,
            suite_version=spec.suite_version,
            model=ModelConfig(
                id=context.model_id,
                revision=context.model_revision,
                provider=context.engine_kind.value,
                endpoint_hash="0" * 64,
            ),
            engine=EngineConfig(
                name=context.engine_kind.value,
                version=context.engine_version or "unknown",
                config_hash="0" * 64,
            ),
            hardware_fingerprint=hw,
            software_provenance=sw,
            dataset=EnvDatasetSpec(id=spec.dataset.id, hash=dataset_hash),
            seed=0,
            quantization=(
                Quantization(format=context.quantization_format)
                if context.quantization_format
                else None
            ),
            metrics=metrics,
            slo_template=spec.slo_template,
        )
        return builder.build()
File without changes
@@ -0,0 +1,91 @@
1
+ """Pydantic schemas for llm-mt benchmark specs + run context."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import StrEnum
6
+ from pathlib import Path
7
+ from typing import Annotated, Literal
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+
12
+ class EngineKind(StrEnum):
13
+ """Engines this plugin can drive.
14
+
15
+ Machine translation is dominated by per-prompt API calls, so the four
16
+ most useful endpoints are self-hosted OpenAI-compatible servers (vLLM,
17
+ SGLang), provider-hosted OpenAI, and Cohere — whose Aya / Command
18
+ models are popular MT picks.
19
+ """
20
+
21
+ VLLM = "vllm"
22
+ SGLANG = "sglang"
23
+ OPENAI = "openai"
24
+ COHERE = "cohere"
25
+
26
+
27
class DatasetConfig(BaseModel):
    """Pointer to the fixture a benchmark is scored against.

    The MT plugin's datasets are small bundled JSONL files holding one
    ``{"source", "reference", "domain"}`` object per line. Unknown keys are
    rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Non-empty dataset identifier.
    id: str = Field(min_length=1)
    # Non-empty fixture location (see the field description).
    path: str = Field(
        min_length=1,
        description=("Path to the fixture JSONL relative to the plugin's datasets/ directory."),
    )
43
+
44
+
45
class WarmupConfig(BaseModel):
    """Warmup knobs for a benchmark.

    MT scoring is per-sentence and order-independent, which is why the
    default discards no runs. The setting exists so that a later revision can
    warm up server-side weights if that turns out to be needed. Unknown keys
    are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Number of initial runs to discard; must be >= 0.
    discard_runs: int = Field(default=0, ge=0)
55
+
56
+
57
class BenchmarkSpec(BaseModel):
    """Declarative description of a single MT benchmark.

    Combines the fixture to translate, the scoring strategy, the language
    pair and descriptive metadata. Unknown keys are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # --- identity -------------------------------------------------------- #
    benchmark_id: str = Field(min_length=1)
    # ``MAJOR.MINOR.PATCH`` with an optional ``-suffix``.
    suite_version: str = Field(pattern=r"^\d+\.\d+\.\d+(-[\w.]+)?$")
    description: str = Field(default="")

    # --- fixed discriminators --------------------------------------------- #
    modality: Literal["llm"] = "llm"
    kind: Literal["translation"] = "translation"

    # --- what to run and how to score it ---------------------------------- #
    dataset: DatasetConfig
    slo_template: str = Field(default="llm.mt.standard")
    warmup: WarmupConfig = Field(default_factory=WarmupConfig)
    scoring: Literal["chrf", "bleu_token", "exact_match"] = "chrf"

    # --- language pair (2 to 8 characters each) ---------------------------- #
    source_lang: str = Field(min_length=2, max_length=8)
    target_lang: str = Field(min_length=2, max_length=8)
72
+
73
+
74
class RunContext(BaseModel):
    """Per-invocation context (where to send requests, where to write results).

    Mirrors the llm-quality plugin shape so cross-plugin tooling can reuse
    the same context object. Unknown keys are rejected (``extra="forbid"``).
    """

    # ``protected_namespaces=()``: ``model_id`` / ``model_revision`` start with
    # pydantic's protected ``model_`` prefix; pydantic releases whose default
    # protected namespace is ``model_`` emit a UserWarning for such fields at
    # class-definition time. The names are part of the shared context shape,
    # so the namespace check is disabled instead of renaming the fields.
    model_config = ConfigDict(
        extra="forbid",
        arbitrary_types_allowed=True,
        protected_namespaces=(),
    )

    # Model under test; ``model_revision`` must be 7 to 40 characters long.
    model_id: Annotated[str, Field(min_length=1)]
    model_revision: Annotated[str, Field(min_length=7, max_length=40)] = "unknown00"

    # Engine / endpoint selection. Empty strings mean "not provided".
    engine_kind: EngineKind
    engine_version: str = ""
    base_url: str = ""
    # Kept out of ``repr()`` so the secret does not leak into logs/tracebacks.
    api_key: str = Field(default="", repr=False)
    quantization_format: str = ""
    hardware_class: str = ""

    # Directory that receives run artefacts.
    output_dir: Path
    # Free-form scalar options (the plugin reads signing settings from here).
    extra: dict[str, str | int | float | bool] = Field(default_factory=dict)
@@ -0,0 +1,131 @@
1
+ """Deterministic scoring strategies for the llm-mt plugin.
2
+
3
+ Three pure functions, each ``(reference, hypothesis) -> float`` in ``[0.0, 1.0]``:
4
+
5
+ - :func:`chrf` — character n-gram F-score (the standard chrF metric).
6
+ - :func:`bleu_token` — corpus-free token BLEU with brevity penalty.
7
+ - :func:`exact_match` — strict strip + lowercase equality (mirrors llm-quality).
8
+
9
+ All three are higher-is-better — translation accuracy in :math:`[0, 1]`. No
10
+ external dependencies. Whitespace is normalised by collapsing runs before
11
+ character-n-gram extraction so chrF behaves the same on ``"hello world"``
12
+ and ``"hello  world"`` (two spaces).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import math
18
+ from collections import Counter
19
+ from collections.abc import Callable
20
+
21
+
22
+ def _char_ngrams(text: str, n: int) -> Counter[str]:
23
+ """Return the multiset of character n-grams of length ``n`` in ``text``.
24
+
25
+ Whitespace is collapsed (any run of whitespace becomes a single space) so
26
+ formatting noise does not perturb the score. ``n`` must be at least 1.
27
+ """
28
+ normalised = " ".join(text.split())
29
+ if len(normalised) < n:
30
+ return Counter()
31
+ return Counter(normalised[i : i + n] for i in range(len(normalised) - n + 1))
32
+
33
+
34
def chrf(reference: str, hypothesis: str, n: int = 6, beta: float = 2.0) -> float:
    """Character n-gram F-score (chrF).

    Collects character n-grams of every order ``1..n`` from ``reference`` and
    ``hypothesis`` (after collapsing whitespace runs to single spaces), pools
    the match / hypothesis / reference counts across orders, and returns the
    F_beta of the pooled precision and recall. An order for which either
    string is too short to contain any n-gram contributes nothing.

    Args:
        reference: Gold translation.
        hypothesis: Model output to score.
        n: Highest n-gram order considered; must be >= 1.
        beta: Recall weight in F_beta (2.0 weights recall over precision).

    Returns:
        A score in ``[0, 1]``: 1.0 when both strings are identical after
        whitespace normalisation (including both empty), 0.0 when exactly one
        side is empty or no n-gram of any order overlaps.

    Raises:
        ValueError: ``n`` is smaller than 1.
    """
    if n < 1:
        msg = "chrf requires n >= 1"
        raise ValueError(msg)

    # Normalise once; the n-grams below are cut straight from these strings
    # instead of re-normalising both inputs for every order.
    ref_norm = " ".join(reference.split())
    hyp_norm = " ".join(hypothesis.split())
    if not ref_norm and not hyp_norm:
        return 1.0
    if not ref_norm or not hyp_norm:
        return 0.0

    total_match = 0
    total_hyp = 0
    total_ref = 0
    shortest = min(len(ref_norm), len(hyp_norm))
    for order in range(1, n + 1):
        if order > shortest:
            # One side has no n-grams of this order, hence none of any higher
            # order either: nothing more can be added to the totals.
            break
        ref_ngrams = Counter(
            ref_norm[i : i + order] for i in range(len(ref_norm) - order + 1)
        )
        hyp_ngrams = Counter(
            hyp_norm[i : i + order] for i in range(len(hyp_norm) - order + 1)
        )
        # Multiset intersection keeps min(count_ref, count_hyp) per n-gram.
        total_match += sum((ref_ngrams & hyp_ngrams).values())
        total_hyp += sum(hyp_ngrams.values())
        total_ref += sum(ref_ngrams.values())

    if total_hyp == 0 or total_ref == 0 or total_match == 0:
        return 0.0
    precision = total_match / total_hyp
    recall = total_match / total_ref
    beta_sq = beta * beta
    denom = beta_sq * precision + recall
    if denom == 0:
        return 0.0
    return (1.0 + beta_sq) * precision * recall / denom
80
+
81
+
82
+ def bleu_token(reference: str, hypothesis: str, max_n: int = 4) -> float:
83
+ """Simple corpus-free BLEU over whitespace-split tokens.
84
+
85
+ Computes modified n-gram precisions for ``n`` in ``1..max_n``, takes
86
+ their geometric mean, and multiplies by the standard brevity penalty
87
+ ``exp(min(0, 1 - r/c))``. Returns 0.0 when any n-gram order has zero
88
+ precision (the canonical BLEU behaviour — no smoothing here, since the
89
+ metric is informational only for the skeleton). Result is in ``[0, 1]``.
90
+ """
91
+ if max_n < 1:
92
+ msg = "bleu_token requires max_n >= 1"
93
+ raise ValueError(msg)
94
+ hyp_tokens = hypothesis.split()
95
+ ref_tokens = reference.split()
96
+ if not hyp_tokens or not ref_tokens:
97
+ return 0.0
98
+
99
+ precisions: list[float] = []
100
+ for n in range(1, max_n + 1):
101
+ if len(hyp_tokens) < n or len(ref_tokens) < n:
102
+ return 0.0
103
+ hyp_ngrams = Counter(tuple(hyp_tokens[i : i + n]) for i in range(len(hyp_tokens) - n + 1))
104
+ ref_ngrams = Counter(tuple(ref_tokens[i : i + n]) for i in range(len(ref_tokens) - n + 1))
105
+ # Modified count: cap each n-gram by its reference count.
106
+ clipped = sum(min(c, ref_ngrams[g]) for g, c in hyp_ngrams.items())
107
+ total = sum(hyp_ngrams.values())
108
+ if total == 0 or clipped == 0:
109
+ return 0.0
110
+ precisions.append(clipped / total)
111
+
112
+ # Geometric mean of precisions.
113
+ log_sum = sum(math.log(p) for p in precisions)
114
+ geo_mean = math.exp(log_sum / len(precisions))
115
+
116
+ c = len(hyp_tokens)
117
+ r = len(ref_tokens)
118
+ brevity = 1.0 if c > r else math.exp(1.0 - r / c)
119
+ return brevity * geo_mean
120
+
121
+
122
def exact_match(reference: str, hypothesis: str) -> float:
    """Score 1.0 when both strings are equal after ``strip()`` + ``lower()``, else 0.0."""
    got = hypothesis.strip().lower()
    want = reference.strip().lower()
    if got == want:
        return 1.0
    return 0.0
125
+
126
+
127
+ SCORERS: dict[str, Callable[[str, str], float]] = {
128
+ "chrf": chrf,
129
+ "bleu_token": bleu_token,
130
+ "exact_match": exact_match,
131
+ }
@@ -0,0 +1,39 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferencebench-mt
3
+ Version: 0.0.2
4
+ Summary: Machine-translation plugin for InferenceBench Suite (chrF/BLEU/exact-match on bundled fixtures)
5
+ Project-URL: Homepage, https://github.com/yobitelcomm/bench
6
+ Author-email: Yobitel Communications <bench@yobitel.com>
7
+ License: Apache-2.0
8
+ Keywords: ai,benchmark,bleu,chrf,llm,ml,translation
9
+ Classifier: Development Status :: 2 - Pre-Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: inferencebench-envelope
18
+ Requires-Dist: inferencebench-harness
19
+ Requires-Dist: pydantic~=2.9
20
+ Requires-Dist: pyyaml~=6.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # inferencebench-mt
24
+
25
+ Machine-translation plugin for the InferenceBench Suite.
26
+
27
+ Scores model translations against bundled reference fixtures using chrF (character
28
+ n-gram F-score), token-level BLEU, or exact match. Mirrors the contract of the
29
+ other plugins (`list_benchmarks` / `get_benchmark` / `validate` / `run`) and
30
+ emits the canonical signed envelope.
31
+
32
+ Four bundled benchmarks ship out of the box (English to French, German, Spanish, and Japanese), including:
33
+
34
+ - `llm.mt.flores-200-mini-en-fr` — FLORES-200-style English to French, chrF.
35
+ - `llm.mt.flores-200-mini-en-de` — FLORES-200-style English to German, chrF.
36
+
37
+ The fixtures are tiny (eight rows each, mixed across greeting / news / technical
38
+ / conversational domains) — intended for skeleton verification, not headline
39
+ numbers.
@@ -0,0 +1,17 @@
1
+ inferencebench_mt/__init__.py,sha256=bAb5E7Q5wiomAtN_QWcYEQxj7bL_DY4HVidDUHr4WVY,312
2
+ inferencebench_mt/plugin.py,sha256=5S-Hdzymsiivc6YDR5s-ogaYj0esvtKU9LJ15TkIZ08,18607
3
+ inferencebench_mt/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ inferencebench_mt/schemas.py,sha256=tpJ206CggNjdX2VEJZyfOEwNEhoPpFdBlDeNsU6M8I0,2943
5
+ inferencebench_mt/scoring.py,sha256=_z80lCIUrJ3Uy___xG9wiFzP_dW3ejEVFylKrm573bA,4998
6
+ inferencebench_mt/benchmarks/flores-200-mini-en-de.yaml,sha256=kUxm5fJOJ_a3QUUAiUmNGxZuSXW7-Sa0jir7a53vdEA,339
7
+ inferencebench_mt/benchmarks/flores-200-mini-en-es.yaml,sha256=lkCSbufCcn9lvPoKxQ1HJrRrOLHdWWiu4pScxyrLOts,340
8
+ inferencebench_mt/benchmarks/flores-200-mini-en-fr.yaml,sha256=BEnt14wkrwfBV1-ZrD9vli39x4zV6E4CQf_eOebl7jg,339
9
+ inferencebench_mt/benchmarks/flores-200-mini-en-ja.yaml,sha256=oCGXI2NuZI658Z83Azo4CqCHVjguXo5pl9p2zWCRkp8,341
10
+ inferencebench_mt/datasets/flores-mini-en-de.jsonl,sha256=ZweW2R0Ku98t8NBrFJP-eiVIgORVZ6TaJMLjtNpIjN8,1194
11
+ inferencebench_mt/datasets/flores-mini-en-es.jsonl,sha256=z9i5MYZGNeZSnSEeqjHPvT45kKZ5yTJ8QBto3EzdqwY,1194
12
+ inferencebench_mt/datasets/flores-mini-en-fr.jsonl,sha256=tmp6wfNyvAGgMJJsec7U4pP5TXhKp0j8bBFLn3mq8LY,1217
13
+ inferencebench_mt/datasets/flores-mini-en-ja.jsonl,sha256=s9mOfQN2WIwf-o-bbRaLl2jM9-QtwNwynmIc0xy3Cl4,1263
14
+ inferencebench_mt-0.0.2.dist-info/METADATA,sha256=jRL7nbrmomSuHuuE5xMWelnJ1lGbFK_VSb5nkU3_6B0,1657
15
+ inferencebench_mt-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
16
+ inferencebench_mt-0.0.2.dist-info/entry_points.txt,sha256=vsbyHlBHcuBrS1IEEg2YJRNR5cMOQs2IX8PoRO3pALw,71
17
+ inferencebench_mt-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [inferencebench.plugins]
2
+ llm.mt = inferencebench_mt.plugin:LLMMTPlugin