inferencebench-mt 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inferencebench_mt/__init__.py +12 -0
- inferencebench_mt/benchmarks/flores-200-mini-en-de.yaml +14 -0
- inferencebench_mt/benchmarks/flores-200-mini-en-es.yaml +14 -0
- inferencebench_mt/benchmarks/flores-200-mini-en-fr.yaml +14 -0
- inferencebench_mt/benchmarks/flores-200-mini-en-ja.yaml +14 -0
- inferencebench_mt/datasets/flores-mini-en-de.jsonl +8 -0
- inferencebench_mt/datasets/flores-mini-en-es.jsonl +8 -0
- inferencebench_mt/datasets/flores-mini-en-fr.jsonl +8 -0
- inferencebench_mt/datasets/flores-mini-en-ja.jsonl +8 -0
- inferencebench_mt/plugin.py +468 -0
- inferencebench_mt/py.typed +0 -0
- inferencebench_mt/schemas.py +91 -0
- inferencebench_mt/scoring.py +131 -0
- inferencebench_mt-0.0.2.dist-info/METADATA +39 -0
- inferencebench_mt-0.0.2.dist-info/RECORD +17 -0
- inferencebench_mt-0.0.2.dist-info/WHEEL +4 -0
- inferencebench_mt-0.0.2.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""InferenceBench LLM machine-translation plugin."""
|
|
2
|
+
|
|
3
|
+
from inferencebench_mt.plugin import EXPECTED_METRICS, LLMMTPlugin
|
|
4
|
+
from inferencebench_mt.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"EXPECTED_METRICS",
|
|
8
|
+
"BenchmarkSpec",
|
|
9
|
+
"EngineKind",
|
|
10
|
+
"LLMMTPlugin",
|
|
11
|
+
"RunContext",
|
|
12
|
+
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-de
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to German, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-de
|
|
8
|
+
path: flores-mini-en-de.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: de
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-es
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to Spanish, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-es
|
|
8
|
+
path: flores-mini-en-es.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: es
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-fr
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to French, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-fr
|
|
8
|
+
path: flores-mini-en-fr.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: fr
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-ja
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to Japanese, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-ja
|
|
8
|
+
path: flores-mini-en-ja.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: ja
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "Hallo, wie geht es dir?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "Guten Morgen, mein Freund.", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "Der Präsident unterzeichnete gestern das neue Klimaabkommen.", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Die Aktienmärkte fielen nach der Ankündigung der Zentralbank stark.", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "Die Transformer-Architektur verwendet Self-Attention-Schichten.", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "Bitte starten Sie den Server neu, nachdem das Update abgeschlossen ist.", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "Ich hätte gerne einen Kaffee mit Milch und Zucker, bitte.", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "Wo ist der nächste Bahnhof?", "domain": "conversational"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "Hola, ¿cómo estás?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "Buenos días, mi amigo.", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "El presidente firmó ayer el nuevo acuerdo climático.", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Los mercados bursátiles cayeron bruscamente tras el anuncio del banco central.", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "La arquitectura transformer utiliza capas de autoatención.", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "Por favor, reinicia el servidor cuando termine la actualización.", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "Quisiera un café con leche y azúcar, por favor.", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "¿Dónde está la estación de tren más cercana?", "domain": "conversational"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "Bonjour, comment allez-vous ?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "Bonjour, mon ami.", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "Le président a signé le nouvel accord sur le climat hier.", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Les marchés boursiers ont fortement chuté après l'annonce de la banque centrale.", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "L'architecture transformeur utilise des couches d'auto-attention.", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "Veuillez redémarrer le serveur une fois la mise à jour terminée.", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "Je voudrais un café avec du lait et du sucre, s'il vous plaît.", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "Où se trouve la gare la plus proche ?", "domain": "conversational"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "こんにちは、お元気ですか?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "おはよう、友よ。", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "大統領は昨日、新しい気候協定に署名しました。", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "中央銀行の発表を受けて、株式市場は急落しました。", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "トランスフォーマーアーキテクチャは自己注意層を使用します。", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "アップデートが完了したらサーバーを再起動してください。", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "ミルクと砂糖入りのコーヒーをお願いします。", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "最寄りの駅はどこですか?", "domain": "conversational"}
|
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
"""LLMMTPlugin — entry point for ``llm.mt`` benchmarks.
|
|
2
|
+
|
|
3
|
+
The fifth-modality sibling to the perf / quality / voice / embeddings plugins.
|
|
4
|
+
Implements the same contract (``list_benchmarks`` / ``get_benchmark`` /
|
|
5
|
+
``validate`` / ``run``) but its headline metric is translation accuracy —
|
|
6
|
+
chrF (character n-gram F-score) by default, with token-BLEU and exact-match
|
|
7
|
+
as alternates. Scoring is deterministic and dependency-free; see
|
|
8
|
+
:mod:`inferencebench_mt.scoring`.
|
|
9
|
+
|
|
10
|
+
The plugin drives a real :class:`ModelClient` per fixture row, constructs an
|
|
11
|
+
MT prompt (``"Translate from {src} to {tgt}: ..."``), and scores the model's
|
|
12
|
+
hypothesis against the bundled reference. Self-hosted OpenAI-compatible
|
|
13
|
+
endpoints (vLLM, SGLang) get the LiteLLM ``openai/`` routing prefix added
|
|
14
|
+
exactly once — same convention as the llm-quality plugin.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import hashlib
|
|
20
|
+
import json
|
|
21
|
+
import math
|
|
22
|
+
import os
|
|
23
|
+
import time
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import TYPE_CHECKING
|
|
26
|
+
|
|
27
|
+
import yaml
|
|
28
|
+
|
|
29
|
+
from inferencebench.envelope import (
|
|
30
|
+
DatasetSpec as EnvDatasetSpec,
|
|
31
|
+
)
|
|
32
|
+
from inferencebench.envelope import (
|
|
33
|
+
EngineConfig,
|
|
34
|
+
Envelope,
|
|
35
|
+
EnvelopeBuilder,
|
|
36
|
+
ModelConfig,
|
|
37
|
+
Quantization,
|
|
38
|
+
SigningMode,
|
|
39
|
+
sign_envelope,
|
|
40
|
+
)
|
|
41
|
+
from inferencebench.harness import (
|
|
42
|
+
CompletionResult,
|
|
43
|
+
ModelClient,
|
|
44
|
+
Sample,
|
|
45
|
+
collect_hardware_fingerprint,
|
|
46
|
+
collect_software_provenance,
|
|
47
|
+
)
|
|
48
|
+
from inferencebench.harness.metrics import EnergyReport, Percentiles, TelemetryWindow
|
|
49
|
+
from inferencebench_mt.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
50
|
+
from inferencebench_mt.scoring import SCORERS
|
|
51
|
+
|
|
52
|
+
if TYPE_CHECKING:
|
|
53
|
+
from collections.abc import Callable
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _json_num(v: float) -> str:
    """Render a number as a JSON literal.

    JSON has no spelling for NaN or infinities, so a non-finite ``float``
    is written as ``null``. Any other value is rendered with ``repr``.
    """
    non_finite = isinstance(v, float) and not math.isfinite(v)
    return "null" if non_finite else repr(v)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _json_str(v: str | None) -> str:
    """Encode ``v`` as a JSON string literal; ``None`` is encoded as ``""``."""
    text = "" if v is None else v
    return json.dumps(text)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Engines that require ``base_url`` (self-hosted OpenAI-compatible servers).
|
|
69
|
+
# OPENAI / COHERE are provider-hosted endpoints and are deliberately absent from this set — base_url is optional for them.
|
|
70
|
+
_SELF_HOSTED_ENGINES = frozenset({EngineKind.VLLM, EngineKind.SGLANG})
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _fixtures_cache_root() -> Path:
    """Return the directory that backs ``fixtures://`` dataset URIs.

    A non-empty ``BENCH_FIXTURES_ROOT`` environment variable takes
    precedence; otherwise the root is ``~/.cache/inferencebench/fixtures``.
    """
    env_root = os.environ.get("BENCH_FIXTURES_ROOT")
    if not env_root:
        return Path.home().joinpath(".cache", "inferencebench", "fixtures")
    return Path(env_root)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _compute_fixture_hash(items: list[dict[str, str]]) -> str:
    """Return the hex SHA-256 digest of the fixture rows.

    Rows are serialised as compact JSON with sorted keys, so the key order
    inside a row does not influence the digest (row order still does).
    """
    payload = json.dumps(items, separators=(",", ":"), sort_keys=True).encode("utf-8")
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _build_client(context: RunContext, *, timeout_s: float = 60.0) -> ModelClient:
    """Create the :class:`ModelClient` described by ``context``.

    For self-hosted engines (members of ``_SELF_HOSTED_ENGINES``) the model
    id is normalised to carry exactly one leading ``openai/`` routing
    prefix: one user-supplied prefix is removed before the prefix is added
    back. Those engines also fall back to the placeholder API key
    ``"EMPTY"`` when the context has none.

    For every other engine the model id is passed through unchanged and an
    empty API key becomes ``None``. An empty ``base_url`` is always passed
    as ``None``.
    """
    self_hosted = context.engine_kind in _SELF_HOSTED_ENGINES
    model_id = context.model_id
    api_key: str | None = context.api_key or None
    if self_hosted:
        bare_id = model_id.removeprefix("openai/")
        model_id = f"openai/{bare_id}"
        api_key = context.api_key or "EMPTY"
    return ModelClient(
        model=model_id,
        api_key=api_key,
        base_url=context.base_url or None,
        timeout_s=timeout_s,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _build_prompt(source: str, source_lang: str, target_lang: str) -> str:
    """Build the translation prompt for a single source sentence.

    The prompt has three parts separated by blank lines: the instruction
    header, the source text, and a trailing ``Translation:`` cue.
    """
    header = f"Translate from {source_lang} to {target_lang}:"
    return "\n\n".join((header, source, "Translation:"))
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Metrics this plugin is expected to emit. Consumed by ``bench coverage``.
|
|
119
|
+
EXPECTED_METRICS: tuple[str, ...] = (
|
|
120
|
+
"chrf_mean",
|
|
121
|
+
"chrf_p50",
|
|
122
|
+
"chrf_p95",
|
|
123
|
+
"ok_rate",
|
|
124
|
+
"n_samples",
|
|
125
|
+
"ttft_p50_ms",
|
|
126
|
+
"total_p50_ms",
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class LLMMTPlugin:
    """Plugin entry point. Registered via ``inferencebench.plugins`` entrypoint group.

    Public contract: :meth:`list_benchmarks`, :meth:`get_benchmark`,
    :meth:`validate` and :meth:`run`. Benchmark specs are YAML files in the
    ``benchmarks/`` directory next to this module; fixtures are JSONL files
    in ``datasets/`` (or in the fixtures cache for ``fixtures://`` paths).
    """

    suite_id = "llm.mt"
    version = "0.0.2"
    description = (
        "LLM machine-translation benchmarks (chrF / token-BLEU / exact-match on bundled fixtures)."
    )

    # ----------------------------------------------------------- benchmarks #
    def list_benchmarks(self) -> list[BenchmarkSpec]:
        """Load every ``*.yaml`` spec from the benchmarks directory.

        Specs are returned in sorted filename order. An empty list is
        returned when the directory does not exist.
        """
        bench_dir = self._benchmarks_dir()
        if not bench_dir.exists():
            return []
        return [self._load_yaml(yml) for yml in sorted(bench_dir.glob("*.yaml"))]

    def get_benchmark(self, benchmark_id: str) -> BenchmarkSpec:
        """Return the spec whose ``benchmark_id`` matches exactly.

        Raises:
            KeyError: if no bundled spec has that id.
        """
        for spec in self.list_benchmarks():
            if spec.benchmark_id == benchmark_id:
                return spec
        msg = f"benchmark_id not found: {benchmark_id}"
        raise KeyError(msg)

    # ------------------------------------------------------------- validate #
    def validate(self, spec: BenchmarkSpec, context: RunContext) -> list[str]:
        """Return human-readable warnings for ``spec`` / ``context``.

        Checks, in order: an empty model id, a self-hosted engine without a
        ``base_url``, and a fixture file that does not exist. An empty list
        means no problem was detected. Nothing is raised for these issues.
        """
        warnings: list[str] = []
        if not context.model_id:
            warnings.append("model_id is empty")
        if context.engine_kind in _SELF_HOSTED_ENGINES and not context.base_url:
            warnings.append(
                f"{context.engine_kind.value} needs base_url (e.g. http://localhost:8000/v1)"
            )
        if not self._dataset_path(spec).exists():
            warnings.append(f"fixture not found: {spec.dataset.path}")
        return warnings

    # ------------------------------------------------------------------ run #
    def run(self, spec: BenchmarkSpec, context: RunContext) -> Envelope:
        """Execute the benchmark and return a SIGNED envelope.

        Signing is selected by ``context.extra["signing_mode"]`` (default
        ``"dev"``). Dev signing needs ``context.extra["dev_key_path"]``;
        any other mode value results in keyless signing.

        Raises:
            FileNotFoundError / ValueError: from fixture loading.
            ValueError: if dev signing is requested without a key path.
        """
        client = _build_client(context)
        items = self._load_fixture(spec)
        fixture_hash = _compute_fixture_hash(items)
        scorer = SCORERS[spec.scoring]

        samples, scores, telemetry = self._score_items(client, items, spec, scorer)

        # Best-effort diagnostic dump — never blocks the run on I/O errors.
        self._dump_samples(context, samples)

        envelope = self._build_envelope(
            spec,
            context,
            samples=samples,
            scores=scores,
            dataset_hash=fixture_hash,
            energy=telemetry.summarise(samples),
        )
        signing_mode = context.extra.get("signing_mode", "dev")
        dev_key_path = context.extra.get("dev_key_path")
        if signing_mode != "dev":
            return sign_envelope(envelope, mode=SigningMode.KEYLESS)
        if not dev_key_path:
            msg = "dev signing requires context.extra['dev_key_path']"
            raise ValueError(msg)
        return sign_envelope(
            envelope,
            mode=SigningMode.DEV,
            dev_key_path=Path(str(dev_key_path)),
        )

    # -------------------------------------------------------- core scoring #
    def _score_items(
        self,
        client: ModelClient,
        items: list[dict[str, str]],
        spec: BenchmarkSpec,
        scorer: Callable[[str, str], float],
    ) -> tuple[list[Sample], list[float], TelemetryWindow]:
        """Call the model once per fixture row and score each hypothesis.

        Rows are processed sequentially inside a ``TelemetryWindow``. A row
        whose completion call raises is recorded as a failed ``Sample``
        (``ok=False``, NaN latencies, the exception text in ``error``) and
        contributes no score, so ``scores`` can be shorter than ``samples``.

        Returns:
            ``(samples, scores, telemetry)``.
        """
        samples: list[Sample] = []
        scores: list[float] = []
        telemetry = TelemetryWindow()
        with telemetry:
            for idx, item in enumerate(items):
                source = item["source"]
                reference = item["reference"]
                prompt = _build_prompt(source, spec.source_lang, spec.target_lang)
                # perf_counter() is in seconds; samples carry milliseconds.
                t_arrival = time.perf_counter() * 1000.0
                try:
                    result: CompletionResult = client.complete(
                        prompt, stream=True, max_tokens=256
                    )
                except Exception as exc:
                    # Deliberately broad: one failing request must not abort
                    # the run; the failure is recorded on the sample instead.
                    samples.append(
                        Sample(
                            request_idx=idx,
                            arrival_ms=t_arrival,
                            start_ms=t_arrival,
                            ttft_ms=float("nan"),
                            total_ms=float("nan"),
                            tpot_ms=float("nan"),
                            tokens_in=0,
                            tokens_out=0,
                            cost_usd=0.0,
                            finish_reason="error",
                            ok=False,
                            error=str(exc),
                        )
                    )
                    continue

                score = float(scorer(reference, result.text))
                scores.append(score)

                sample_extra: dict[str, str | int | float | bool] = {
                    "domain": item.get("domain", ""),
                    "score": score,
                }
                samples.append(
                    Sample(
                        request_idx=idx,
                        arrival_ms=t_arrival,
                        start_ms=t_arrival,
                        ttft_ms=result.ttft_ms,
                        total_ms=result.total_ms,
                        tpot_ms=result.tpot_ms,
                        tokens_in=result.tokens_in,
                        tokens_out=result.tokens_out,
                        cost_usd=result.cost_usd,
                        finish_reason=result.finish_reason,
                        ok=True,
                        extra=sample_extra,
                    )
                )
        return samples, scores, telemetry

    # ------------------------------------------------------------ samples #
    @staticmethod
    def _sample_to_json(sample: Sample) -> str:
        """Serialise one sample as a compact, strictly valid JSON object.

        Key order is ``request_idx, ok, ttft_ms, total_ms, tokens_in,
        tokens_out, [score], finish_reason, [error]``. ``score`` is present
        only when ``sample.extra`` holds a numeric score; ``error`` only
        when it is non-empty. Non-finite floats are written as ``null``.
        """

        def finite_or_none(value: object) -> object:
            # JSON cannot represent NaN / ±inf, so those become null.
            if isinstance(value, float) and not math.isfinite(value):
                return None
            return value

        record: dict[str, object] = {
            "request_idx": sample.request_idx,
            "ok": bool(sample.ok),
            "ttft_ms": finite_or_none(sample.ttft_ms),
            "total_ms": finite_or_none(sample.total_ms),
            "tokens_in": sample.tokens_in,
            "tokens_out": sample.tokens_out,
        }
        score = sample.extra.get("score") if sample.extra else None
        if isinstance(score, (int, float)):
            record["score"] = finite_or_none(float(score))
        # Going through json.dumps escapes quotes/backslashes in
        # finish_reason, which raw string concatenation did not.
        record["finish_reason"] = sample.finish_reason or ""
        if sample.error:
            record["error"] = sample.error
        return json.dumps(record, separators=(",", ":"))

    def _dump_samples(self, context: RunContext, samples: list[Sample]) -> None:
        """Write per-request samples (incl. score) to ``<output_dir>/samples-<ts>.jsonl``.

        ``<ts>`` is the current Unix time in whole seconds. The output
        directory is created if needed. This is a diagnostics-only dump:
        ``OSError`` is swallowed so filesystem problems never block the run.
        """
        try:
            out_dir = Path(context.output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)
            path = out_dir / f"samples-{int(time.time())}.jsonl"
            with path.open("w", encoding="utf-8") as fp:
                fp.writelines(self._sample_to_json(s) + "\n" for s in samples)
        except OSError:
            pass  # diagnostics-only — never block the run

    # ---------------------------------------------------------- file paths #
    def _benchmarks_dir(self) -> Path:
        """Directory holding the bundled benchmark YAML specs."""
        return Path(__file__).parent / "benchmarks"

    def _datasets_dir(self) -> Path:
        """Directory holding the bundled JSONL fixtures."""
        return Path(__file__).parent / "datasets"

    def _dataset_path(self, spec: BenchmarkSpec) -> Path:
        """Resolve ``spec.dataset.path`` to a filesystem path.

        ``fixtures://<key>`` maps to ``<fixtures cache root>/<key>.jsonl``;
        anything else is taken relative to the bundled datasets directory.
        """
        raw = spec.dataset.path
        scheme = "fixtures://"
        if raw.startswith(scheme):
            return _fixtures_cache_root() / f"{raw[len(scheme) :]}.jsonl"
        return self._datasets_dir() / raw

    def _load_yaml(self, path: Path) -> BenchmarkSpec:
        """Parse one YAML file into a validated ``BenchmarkSpec``.

        An empty document is treated as ``{}`` and therefore fails
        validation on the required fields.
        """
        raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
        return BenchmarkSpec.model_validate(raw)

    def _load_fixture(self, spec: BenchmarkSpec) -> list[dict[str, str]]:
        """Read the JSONL fixture for ``spec`` into a list of rows.

        Blank lines, non-object lines, and objects lacking ``source`` or
        ``reference`` are skipped. Kept rows are reduced to the string
        fields ``source``, ``reference`` and ``domain`` (default ``""``).

        Raises:
            FileNotFoundError: if the fixture file does not exist.
            ValueError: if no usable row was found.
        """
        path = self._dataset_path(spec)
        if not path.exists():
            if spec.dataset.path.startswith("fixtures://"):
                key = spec.dataset.path[len("fixtures://") :]
                msg = f"fixture not cached: {path}. Run `bench fixtures fetch {key}` first."
                raise FileNotFoundError(msg)
            msg = f"fixture not found: {path}"
            raise FileNotFoundError(msg)

        items: list[dict[str, str]] = []
        with path.open("r", encoding="utf-8") as fp:
            for raw_line in fp:
                line = raw_line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                if not isinstance(obj, dict):
                    continue
                if "source" not in obj or "reference" not in obj:
                    continue
                items.append(
                    {
                        "source": str(obj["source"]),
                        "reference": str(obj["reference"]),
                        "domain": str(obj.get("domain", "")),
                    }
                )
        if not items:
            msg = f"fixture is empty: {path}"
            raise ValueError(msg)
        return items

    # ---------------------------------------------------------- envelope #
    def _build_envelope(
        self,
        spec: BenchmarkSpec,
        context: RunContext,
        *,
        samples: list[Sample],
        scores: list[float],
        dataset_hash: str,
        energy: EnergyReport | None = None,
    ) -> Envelope:
        """Aggregate samples and scores into an unsigned ``Envelope``.

        Always emits ``n_samples``, ``n_ok`` and ``ok_rate``. Score, latency,
        token, energy and cost metrics are emitted only when the data for
        them exists (see the inline comments).
        """
        hw = collect_hardware_fingerprint()
        sw = collect_software_provenance()

        metrics: dict[str, float | int | str | None] = {}

        ok_samples = [s for s in samples if s.ok]
        n_ok = len(ok_samples)
        metrics["n_samples"] = float(len(samples))
        metrics["n_ok"] = float(n_ok)
        metrics["ok_rate"] = float(n_ok) / float(len(samples)) if samples else 0.0

        # Headline scoring metric — keyed by the spec's scoring strategy so
        # downstream ``bench diff`` knows the direction (all three MT scorers
        # are higher-is-better).
        if scores:
            mean_score = sum(scores) / len(scores)
            if spec.scoring == "exact_match":
                metrics["exact_match_rate"] = mean_score
            else:
                prefix = spec.scoring  # "chrf" | "bleu_token"
                # Keep the metric key short for BLEU (``bleu_mean``, not
                # ``bleu_token_mean``) so diff's _HIGHER_IS_BETTER policy and
                # the leaderboard column headers stay readable.
                key_prefix = "bleu" if prefix == "bleu_token" else prefix
                metrics[f"{key_prefix}_mean"] = mean_score
                if len(scores) >= 2:
                    pcts = Percentiles(scores, percentiles=(50.0, 95.0))
                    metrics[f"{key_prefix}_p50"] = pcts.p50
                    metrics[f"{key_prefix}_p95"] = pcts.p95
                else:
                    # A single score is its own median and p95.
                    metrics[f"{key_prefix}_p50"] = mean_score
                    metrics[f"{key_prefix}_p95"] = mean_score

        # Latency aggregates — "quality at what cost" comparisons.
        ttft_vals = [s.ttft_ms for s in ok_samples if math.isfinite(s.ttft_ms)]
        total_vals = [s.total_ms for s in ok_samples if math.isfinite(s.total_ms)]
        if ttft_vals:
            metrics["ttft_p50_ms"] = Percentiles(ttft_vals).p50
        if total_vals:
            metrics["total_p50_ms"] = Percentiles(total_vals).p50

        tokens_out_total = sum(s.tokens_out for s in ok_samples)
        if tokens_out_total:
            metrics["tokens_out_total"] = float(tokens_out_total)

        # Energy / power summary from telemetry (None on plugins that haven't
        # threaded a TelemetryWindow through yet). Mirrors llm-inference.
        if energy is not None:
            if energy.gpu_power_avg_w > 0:
                metrics["power_avg_w"] = energy.gpu_power_avg_w
                metrics["power_peak_w"] = energy.gpu_power_peak_w
            if energy.total_energy_joules > 0:
                metrics["energy_joules_total"] = energy.total_energy_joules
            # Self-comparison is False only for NaN.
            if energy.joules_per_token == energy.joules_per_token:  # not NaN
                metrics["joules_per_token"] = energy.joules_per_token

        # Cost: only emit when the provider actually reported it. Self-hosted
        # vLLM / SGLang never do; the perf plugin's pricing-registry fallback
        # is intentionally NOT mirrored here — MT runs are cheap enough
        # that a missing-cost row is more honest than an estimated one.
        cost_total = sum(s.cost_usd for s in ok_samples)
        if tokens_out_total and cost_total > 0:
            metrics["cost_usd_per_million_tokens"] = (cost_total / tokens_out_total) * 1e6
            metrics["cost_source"] = "provider"

        builder = EnvelopeBuilder(
            suite_id=spec.benchmark_id,
            suite_version=spec.suite_version,
            model=ModelConfig(
                id=context.model_id,
                revision=context.model_revision,
                provider=context.engine_kind.value,
                # NOTE(review): all-zero hash — presumably a placeholder; confirm.
                endpoint_hash="0" * 64,
            ),
            engine=EngineConfig(
                name=context.engine_kind.value,
                version=context.engine_version or "unknown",
                # NOTE(review): all-zero hash — presumably a placeholder; confirm.
                config_hash="0" * 64,
            ),
            hardware_fingerprint=hw,
            software_provenance=sw,
            dataset=EnvDatasetSpec(id=spec.dataset.id, hash=dataset_hash),
            seed=0,
            quantization=(
                Quantization(format=context.quantization_format)
                if context.quantization_format
                else None
            ),
            metrics=metrics,
            slo_template=spec.slo_template,
        )
        return builder.build()
|
|
File without changes
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Pydantic schemas for llm-mt benchmark specs + run context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated, Literal
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EngineKind(StrEnum):
|
|
13
|
+
"""Engines this plugin can drive.
|
|
14
|
+
|
|
15
|
+
Machine translation is dominated by per-prompt API calls, so the four
|
|
16
|
+
most useful endpoints are self-hosted OpenAI-compatible servers (vLLM,
|
|
17
|
+
SGLang), provider-hosted OpenAI, and Cohere — whose Aya / Command
|
|
18
|
+
models are popular MT picks.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
VLLM = "vllm"
|
|
22
|
+
SGLANG = "sglang"
|
|
23
|
+
OPENAI = "openai"
|
|
24
|
+
COHERE = "cohere"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DatasetConfig(BaseModel):
    """Identifies the fixture a benchmark is scored against.

    The fixture is a JSONL file with one ``{"source", "reference",
    "domain"}`` object per line. Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Non-empty dataset identifier.
    id: Annotated[str, Field(min_length=1)]

    # Non-empty fixture location. NOTE: the plugin's path resolver also
    # treats values starting with ``fixtures://`` as keys into the fixtures
    # cache rather than as paths relative to datasets/.
    path: Annotated[
        str,
        Field(
            min_length=1,
            description="Path to the fixture JSONL relative to the plugin's datasets/ directory.",
        ),
    ]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class WarmupConfig(BaseModel):
    """Warm-up settings for a benchmark.

    ``discard_runs`` must be non-negative and defaults to zero, because MT
    scoring is per-sentence and order-independent. The field is kept as a
    knob so a later revision can warm up server-side weights if needed.
    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Number of initial runs to discard; must be >= 0.
    discard_runs: Annotated[int, Field(ge=0)] = 0
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class BenchmarkSpec(BaseModel):
    """Declarative description of one MT benchmark.

    Combines identity (id, version, description), the fixture to run, the
    scoring strategy, and the language pair. Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # --- identity ---------------------------------------------------------
    benchmark_id: Annotated[str, Field(min_length=1)]
    # ``MAJOR.MINOR.PATCH`` with an optional ``-suffix``.
    suite_version: Annotated[str, Field(pattern=r"^\d+\.\d+\.\d+(-[\w.]+)?$")]
    description: str = ""

    # --- fixed discriminators ---------------------------------------------
    modality: Literal["llm"] = "llm"
    kind: Literal["translation"] = "translation"

    # --- what to run and how to judge it ----------------------------------
    dataset: DatasetConfig
    slo_template: str = "llm.mt.standard"
    warmup: WarmupConfig = Field(default_factory=WarmupConfig)
    scoring: Literal["chrf", "bleu_token", "exact_match"] = "chrf"

    # --- language pair (2 to 8 characters each) ---------------------------
    source_lang: Annotated[str, Field(min_length=2, max_length=8)]
    target_lang: Annotated[str, Field(min_length=2, max_length=8)]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class RunContext(BaseModel):
    """Settings for a single benchmark invocation.

    Says which model and engine to call, how to reach and authenticate
    against it, and where to write results. Per the original design note,
    the shape mirrors the llm-quality plugin's context so cross-plugin
    tooling can reuse one object. Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)

    # --- model ------------------------------------------------------------
    model_id: Annotated[str, Field(min_length=1)]
    # 7 to 40 characters; defaults to the sentinel "unknown00".
    model_revision: Annotated[str, Field(min_length=7, max_length=40)] = "unknown00"

    # --- engine / endpoint ------------------------------------------------
    engine_kind: EngineKind
    engine_version: str = ""
    base_url: str = ""
    api_key: str = ""

    # --- descriptive metadata ---------------------------------------------
    quantization_format: str = ""
    hardware_class: str = ""

    # --- output -----------------------------------------------------------
    output_dir: Path
    # Free-form scalar options, empty by default.
    extra: dict[str, str | int | float | bool] = Field(default_factory=dict)
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Deterministic scoring strategies for the llm-mt plugin.
|
|
2
|
+
|
|
3
|
+
Three pure functions, each ``(reference, hypothesis) -> float`` in ``[0.0, 1.0]``:
|
|
4
|
+
|
|
5
|
+
- :func:`chrf` — character n-gram F-score (the standard chrF metric).
|
|
6
|
+
- :func:`bleu_token` — corpus-free token BLEU with brevity penalty.
|
|
7
|
+
- :func:`exact_match` — strict strip + lowercase equality (mirrors llm-quality).
|
|
8
|
+
|
|
9
|
+
All three are higher-is-better — translation accuracy in :math:`[0, 1]`. No
|
|
10
|
+
external dependencies. Whitespace is normalised by collapsing runs before
|
|
11
|
+
character-n-gram extraction so chrF behaves the same on ``"hello world"``
|
|
12
|
+
and ``"hello   world"``.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import math
|
|
18
|
+
from collections import Counter
|
|
19
|
+
from collections.abc import Callable
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _char_ngrams(text: str, n: int) -> Counter[str]:
    """Return the multiset of character n-grams of length ``n`` in ``text``.

    Whitespace is collapsed (any run of whitespace becomes a single space, and
    leading/trailing whitespace is dropped) so formatting noise does not
    perturb the score.

    Args:
        text: The string to extract n-grams from.
        n: The n-gram length; must be at least 1.

    Returns:
        A ``Counter`` mapping each n-gram to its number of occurrences. The
        counter is empty when the normalised text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 would silently count empty strings and a
        # negative n would produce mis-sized slices.
        msg = "_char_ngrams requires n >= 1"
        raise ValueError(msg)
    normalised = " ".join(text.split())
    # When the text is shorter than ``n`` the range is empty, so the result is
    # an empty Counter without needing a separate length check.
    return Counter(normalised[i : i + n] for i in range(len(normalised) - n + 1))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def chrf(reference: str, hypothesis: str, n: int = 6, beta: float = 2.0) -> float:
    """Score ``hypothesis`` against ``reference`` with a character n-gram F-score.

    Character n-grams of every order from 1 to ``n`` are counted on both
    sides (after whitespace normalisation). Matches, hypothesis n-grams and
    reference n-grams are pooled across orders, and one precision / recall /
    F_beta is computed from the pooled totals. An order for which either side
    has no n-grams contributes nothing to the totals.

    Args:
        reference: The reference translation.
        hypothesis: The candidate translation.
        n: Highest n-gram order to include; must be at least 1.
        beta: Recall weight in the F_beta formula.

    Returns:
        A score in ``[0, 1]``: 1.0 for identical inputs (including two
        blank inputs), 0.0 when exactly one side is blank or when no n-gram
        of any order is shared.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        msg = "chrf requires n >= 1"
        raise ValueError(msg)

    # A string is "blank" when nothing survives whitespace splitting.
    ref_blank = not reference.split()
    hyp_blank = not hypothesis.split()
    if ref_blank and hyp_blank:
        return 1.0
    if ref_blank or hyp_blank:
        return 0.0

    matched = 0
    hyp_count = 0
    ref_count = 0
    for order in range(1, n + 1):
        ref_counts = _char_ngrams(reference, order)
        hyp_counts = _char_ngrams(hypothesis, order)
        if ref_counts and hyp_counts:
            # Multiset intersection keeps the smaller count of each n-gram.
            matched += (ref_counts & hyp_counts).total()
            hyp_count += hyp_counts.total()
            ref_count += ref_counts.total()

    if not (hyp_count and ref_count and matched):
        return 0.0

    precision = matched / hyp_count
    recall = matched / ref_count
    weight = beta * beta
    denominator = weight * precision + recall
    if denominator == 0:
        return 0.0
    return (1.0 + weight) * precision * recall / denominator
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def bleu_token(reference: str, hypothesis: str, max_n: int = 4) -> float:
|
|
83
|
+
"""Simple corpus-free BLEU over whitespace-split tokens.
|
|
84
|
+
|
|
85
|
+
Computes modified n-gram precisions for ``n`` in ``1..max_n``, takes
|
|
86
|
+
their geometric mean, and multiplies by the standard brevity penalty
|
|
87
|
+
``exp(min(0, 1 - r/c))``. Returns 0.0 when any n-gram order has zero
|
|
88
|
+
precision (the canonical BLEU behaviour — no smoothing here, since the
|
|
89
|
+
metric is informational only for the skeleton). Result is in ``[0, 1]``.
|
|
90
|
+
"""
|
|
91
|
+
if max_n < 1:
|
|
92
|
+
msg = "bleu_token requires max_n >= 1"
|
|
93
|
+
raise ValueError(msg)
|
|
94
|
+
hyp_tokens = hypothesis.split()
|
|
95
|
+
ref_tokens = reference.split()
|
|
96
|
+
if not hyp_tokens or not ref_tokens:
|
|
97
|
+
return 0.0
|
|
98
|
+
|
|
99
|
+
precisions: list[float] = []
|
|
100
|
+
for n in range(1, max_n + 1):
|
|
101
|
+
if len(hyp_tokens) < n or len(ref_tokens) < n:
|
|
102
|
+
return 0.0
|
|
103
|
+
hyp_ngrams = Counter(tuple(hyp_tokens[i : i + n]) for i in range(len(hyp_tokens) - n + 1))
|
|
104
|
+
ref_ngrams = Counter(tuple(ref_tokens[i : i + n]) for i in range(len(ref_tokens) - n + 1))
|
|
105
|
+
# Modified count: cap each n-gram by its reference count.
|
|
106
|
+
clipped = sum(min(c, ref_ngrams[g]) for g, c in hyp_ngrams.items())
|
|
107
|
+
total = sum(hyp_ngrams.values())
|
|
108
|
+
if total == 0 or clipped == 0:
|
|
109
|
+
return 0.0
|
|
110
|
+
precisions.append(clipped / total)
|
|
111
|
+
|
|
112
|
+
# Geometric mean of precisions.
|
|
113
|
+
log_sum = sum(math.log(p) for p in precisions)
|
|
114
|
+
geo_mean = math.exp(log_sum / len(precisions))
|
|
115
|
+
|
|
116
|
+
c = len(hyp_tokens)
|
|
117
|
+
r = len(ref_tokens)
|
|
118
|
+
brevity = 1.0 if c > r else math.exp(1.0 - r / c)
|
|
119
|
+
return brevity * geo_mean
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def exact_match(reference: str, hypothesis: str) -> float:
    """Return 1.0 if both strings are equal after strip + lowercase, else 0.0.

    Only leading and trailing whitespace is removed; interior whitespace is
    compared as-is.
    """
    expected = reference.strip().lower()
    actual = hypothesis.strip().lower()
    return float(actual == expected)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Registry of the scoring strategies defined in this module, keyed by name.
# The keys are the same three names accepted by ``BenchmarkSpec.scoring`` in
# ``schemas.py``. Each value is typed as ``(reference, hypothesis) -> float``.
SCORERS: dict[str, Callable[[str, str], float]] = {
    "chrf": chrf,
    "bleu_token": bleu_token,
    "exact_match": exact_match,
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inferencebench-mt
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Machine-translation plugin for InferenceBench Suite (chrF/BLEU/exact-match on bundled fixtures)
|
|
5
|
+
Project-URL: Homepage, https://github.com/yobitelcomm/bench
|
|
6
|
+
Author-email: Yobitel Communications <bench@yobitel.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Keywords: ai,benchmark,bleu,chrf,llm,ml,translation
|
|
9
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: inferencebench-envelope
|
|
18
|
+
Requires-Dist: inferencebench-harness
|
|
19
|
+
Requires-Dist: pydantic~=2.9
|
|
20
|
+
Requires-Dist: pyyaml~=6.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# inferencebench-mt
|
|
24
|
+
|
|
25
|
+
Machine-translation plugin for the InferenceBench Suite.
|
|
26
|
+
|
|
27
|
+
Scores model translations against bundled reference fixtures using chrF (character
|
|
28
|
+
n-gram F-score), token-level BLEU, or exact match. Mirrors the contract of the
|
|
29
|
+
other plugins (`list_benchmarks` / `get_benchmark` / `validate` / `run`) and
|
|
30
|
+
emits the canonical signed envelope.
|
|
31
|
+
|
|
32
|
+
Four bundled benchmarks ship out of the box (en-fr, en-de, en-es, en-ja); for example:
|
|
33
|
+
|
|
34
|
+
- `llm.mt.flores-200-mini-en-fr` — FLORES-200-style English to French, chrF.
|
|
35
|
+
- `llm.mt.flores-200-mini-en-de` — FLORES-200-style English to German, chrF.
|
|
36
|
+
|
|
37
|
+
The fixtures are tiny (eight rows each, mixed across greeting / news / technical
|
|
38
|
+
/ conversational domains) — intended for skeleton verification, not headline
|
|
39
|
+
numbers.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
inferencebench_mt/__init__.py,sha256=bAb5E7Q5wiomAtN_QWcYEQxj7bL_DY4HVidDUHr4WVY,312
|
|
2
|
+
inferencebench_mt/plugin.py,sha256=5S-Hdzymsiivc6YDR5s-ogaYj0esvtKU9LJ15TkIZ08,18607
|
|
3
|
+
inferencebench_mt/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
inferencebench_mt/schemas.py,sha256=tpJ206CggNjdX2VEJZyfOEwNEhoPpFdBlDeNsU6M8I0,2943
|
|
5
|
+
inferencebench_mt/scoring.py,sha256=_z80lCIUrJ3Uy___xG9wiFzP_dW3ejEVFylKrm573bA,4998
|
|
6
|
+
inferencebench_mt/benchmarks/flores-200-mini-en-de.yaml,sha256=kUxm5fJOJ_a3QUUAiUmNGxZuSXW7-Sa0jir7a53vdEA,339
|
|
7
|
+
inferencebench_mt/benchmarks/flores-200-mini-en-es.yaml,sha256=lkCSbufCcn9lvPoKxQ1HJrRrOLHdWWiu4pScxyrLOts,340
|
|
8
|
+
inferencebench_mt/benchmarks/flores-200-mini-en-fr.yaml,sha256=BEnt14wkrwfBV1-ZrD9vli39x4zV6E4CQf_eOebl7jg,339
|
|
9
|
+
inferencebench_mt/benchmarks/flores-200-mini-en-ja.yaml,sha256=oCGXI2NuZI658Z83Azo4CqCHVjguXo5pl9p2zWCRkp8,341
|
|
10
|
+
inferencebench_mt/datasets/flores-mini-en-de.jsonl,sha256=ZweW2R0Ku98t8NBrFJP-eiVIgORVZ6TaJMLjtNpIjN8,1194
|
|
11
|
+
inferencebench_mt/datasets/flores-mini-en-es.jsonl,sha256=z9i5MYZGNeZSnSEeqjHPvT45kKZ5yTJ8QBto3EzdqwY,1194
|
|
12
|
+
inferencebench_mt/datasets/flores-mini-en-fr.jsonl,sha256=tmp6wfNyvAGgMJJsec7U4pP5TXhKp0j8bBFLn3mq8LY,1217
|
|
13
|
+
inferencebench_mt/datasets/flores-mini-en-ja.jsonl,sha256=s9mOfQN2WIwf-o-bbRaLl2jM9-QtwNwynmIc0xy3Cl4,1263
|
|
14
|
+
inferencebench_mt-0.0.2.dist-info/METADATA,sha256=jRL7nbrmomSuHuuE5xMWelnJ1lGbFK_VSb5nkU3_6B0,1657
|
|
15
|
+
inferencebench_mt-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
16
|
+
inferencebench_mt-0.0.2.dist-info/entry_points.txt,sha256=vsbyHlBHcuBrS1IEEg2YJRNR5cMOQs2IX8PoRO3pALw,71
|
|
17
|
+
inferencebench_mt-0.0.2.dist-info/RECORD,,
|