inferencebench-mt 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inferencebench_mt-0.0.2/.gitignore +137 -0
- inferencebench_mt-0.0.2/PKG-INFO +39 -0
- inferencebench_mt-0.0.2/README.md +17 -0
- inferencebench_mt-0.0.2/pyproject.toml +43 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/__init__.py +12 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/benchmarks/flores-200-mini-en-de.yaml +14 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/benchmarks/flores-200-mini-en-es.yaml +14 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/benchmarks/flores-200-mini-en-fr.yaml +14 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/benchmarks/flores-200-mini-en-ja.yaml +14 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/datasets/flores-mini-en-de.jsonl +8 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/datasets/flores-mini-en-es.jsonl +8 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/datasets/flores-mini-en-fr.jsonl +8 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/datasets/flores-mini-en-ja.jsonl +8 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/plugin.py +468 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/py.typed +0 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/schemas.py +91 -0
- inferencebench_mt-0.0.2/src/inferencebench_mt/scoring.py +131 -0
- inferencebench_mt-0.0.2/tests/conftest.py +70 -0
- inferencebench_mt-0.0.2/tests/test_mt_plugin.py +296 -0
- inferencebench_mt-0.0.2/tests/test_mt_scoring.py +117 -0
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
share/python-wheels/
|
|
20
|
+
*.egg-info/
|
|
21
|
+
.installed.cfg
|
|
22
|
+
*.egg
|
|
23
|
+
MANIFEST
|
|
24
|
+
|
|
25
|
+
# uv / virtualenv
|
|
26
|
+
.venv/
|
|
27
|
+
venv/
|
|
28
|
+
env/
|
|
29
|
+
ENV/
|
|
30
|
+
uv.lock.tmp
|
|
31
|
+
.python-version
|
|
32
|
+
|
|
33
|
+
# Testing / coverage
|
|
34
|
+
.tox/
|
|
35
|
+
.nox/
|
|
36
|
+
.coverage
|
|
37
|
+
.coverage.*
|
|
38
|
+
.cache
|
|
39
|
+
nosetests.xml
|
|
40
|
+
coverage.xml
|
|
41
|
+
*.cover
|
|
42
|
+
*.py,cover
|
|
43
|
+
.hypothesis/
|
|
44
|
+
.pytest_cache/
|
|
45
|
+
cover/
|
|
46
|
+
htmlcov/
|
|
47
|
+
|
|
48
|
+
# Type checking
|
|
49
|
+
.mypy_cache/
|
|
50
|
+
.dmypy.json
|
|
51
|
+
dmypy.json
|
|
52
|
+
.pyre/
|
|
53
|
+
.pytype/
|
|
54
|
+
|
|
55
|
+
# Ruff
|
|
56
|
+
.ruff_cache/
|
|
57
|
+
|
|
58
|
+
# IDE / editor
|
|
59
|
+
.idea/
|
|
60
|
+
.vscode/
|
|
61
|
+
*.swp
|
|
62
|
+
*.swo
|
|
63
|
+
*~
|
|
64
|
+
.DS_Store
|
|
65
|
+
|
|
66
|
+
# OS
|
|
67
|
+
Thumbs.db
|
|
68
|
+
desktop.ini
|
|
69
|
+
|
|
70
|
+
# Secrets / env
|
|
71
|
+
.env
|
|
72
|
+
.env.*
|
|
73
|
+
!.env.example
|
|
74
|
+
.envrc
|
|
75
|
+
|
|
76
|
+
# Bench-specific local caches
|
|
77
|
+
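# NOTE: git does not expand "~" in ignore patterns; the next line only matches a
# directory literally named "~" at the repo root. A cache under $HOME is outside
# the work tree and needs no entry.
~/.cache/inferencebench/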
|
|
78
|
+
.cache/inferencebench/
|
|
79
|
+
.inferencebench/
|
|
80
|
+
|
|
81
|
+
# Sigstore dev keys (never commit private keys)
|
|
82
|
+
cosign.key
|
|
83
|
+
cosign-*.key
|
|
84
|
+
cosign-*.pub
|
|
85
|
+
.bench/*.key
|
|
86
|
+
# Local benchmark working dirs (kept local; published outputs land under validation-runs/)
|
|
87
|
+
envelopes-voice/
|
|
88
|
+
envelopes-*/
|
|
89
|
+
*.pem
|
|
90
|
+
!tests/fixtures/**/*.pem
|
|
91
|
+
|
|
92
|
+
# Real-GPU validation artifacts (kept locally, never pushed)
|
|
93
|
+
# Use slash-star (not trailing slash) so individual subpaths can be re-included below.
|
|
94
|
+
validation-runs/*
|
|
95
|
+
# ...except the canonical published marathon corpus — small, public, used by docs + CI
|
|
96
|
+
!validation-runs/2026-05-18-multi-vendor-marathon
|
|
97
|
+
validation-runs/2026-05-18-multi-vendor-marathon/*
|
|
98
|
+
!validation-runs/2026-05-18-multi-vendor-marathon/marathon
|
|
99
|
+
validation-runs/2026-05-18-multi-vendor-marathon/marathon/*
|
|
100
|
+
!validation-runs/2026-05-18-multi-vendor-marathon/marathon/all
|
|
101
|
+
!validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/*.json
|
|
102
|
+
# Voice ASR validation envelopes (small, public, used by leaderboard build)
|
|
103
|
+
!validation-runs/2026-05-25-voice-rtx4000ada
|
|
104
|
+
!validation-runs/2026-05-25-voice-rtx4000ada/*.json
|
|
105
|
+
!validation-runs/2026-05-29-voice-testbm-h100
|
|
106
|
+
!validation-runs/2026-05-29-voice-testbm-h100/*.json
|
|
107
|
+
|
|
108
|
+
# Model weights / datasets (use Git LFS or S3)
|
|
109
|
+
*.bin
|
|
110
|
+
*.safetensors
|
|
111
|
+
*.pt
|
|
112
|
+
*.pth
|
|
113
|
+
*.gguf
|
|
114
|
+
*.onnx
|
|
115
|
+
*.parquet
|
|
116
|
+
!tests/fixtures/**/*.parquet
|
|
117
|
+
|
|
118
|
+
# Logs
|
|
119
|
+
*.log
|
|
120
|
+
logs/
|
|
121
|
+
|
|
122
|
+
# Documentation build
|
|
123
|
+
docs/_build/
|
|
124
|
+
site/
|
|
125
|
+
|
|
126
|
+
# Internal-only files (Claude Code context + planning) — kept locally, not pushed
|
|
127
|
+
/CLAUDE.md
|
|
128
|
+
/INDEX.md
|
|
129
|
+
/PROJECT_PLAN.md
|
|
130
|
+
/CONVENTIONS.md
|
|
131
|
+
/HUMAN_REVIEW_GATES.md
|
|
132
|
+
**/CLAUDE.md
|
|
133
|
+
memory/
|
|
134
|
+
skills/
|
|
135
|
+
agents/
|
|
136
|
+
.claude/
|
|
137
|
+
TICKETS/
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inferencebench-mt
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Machine-translation plugin for InferenceBench Suite (chrF/BLEU/exact-match on bundled fixtures)
|
|
5
|
+
Project-URL: Homepage, https://github.com/yobitelcomm/bench
|
|
6
|
+
Author-email: Yobitel Communications <bench@yobitel.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Keywords: ai,benchmark,bleu,chrf,llm,ml,translation
|
|
9
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: inferencebench-envelope
|
|
18
|
+
Requires-Dist: inferencebench-harness
|
|
19
|
+
Requires-Dist: pydantic~=2.9
|
|
20
|
+
Requires-Dist: pyyaml~=6.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# inferencebench-mt
|
|
24
|
+
|
|
25
|
+
Machine-translation plugin for the InferenceBench Suite.
|
|
26
|
+
|
|
27
|
+
Scores model translations against bundled reference fixtures using chrF (character
|
|
28
|
+
n-gram F-score), token-level BLEU, or exact match. Mirrors the contract of the
|
|
29
|
+
other plugins (`list_benchmarks` / `get_benchmark` / `validate` / `run`) and
|
|
30
|
+
emits the canonical signed envelope.
|
|
31
|
+
|
|
32
|
+
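Four bundled benchmarks ship out of the box:
|
|
33
|
+
|
|
34
|
+
- `llm.mt.flores-200-mini-en-fr` — FLORES-200-style English to French, chrF.
|
|
35
|
+
- `llm.mt.flores-200-mini-en-de` — FLORES-200-style English to German, chrF.
- `llm.mt.flores-200-mini-en-es` — FLORES-200-style English to Spanish, chrF.
- `llm.mt.flores-200-mini-en-ja` — FLORES-200-style English to Japanese, chrF.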
|
|
36
|
+
|
|
37
|
+
The fixtures are tiny (eight rows each, mixed across greeting / news / technical
|
|
38
|
+
/ conversational domains) — intended for skeleton verification, not headline
|
|
39
|
+
numbers.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# inferencebench-mt
|
|
2
|
+
|
|
3
|
+
Machine-translation plugin for the InferenceBench Suite.
|
|
4
|
+
|
|
5
|
+
Scores model translations against bundled reference fixtures using chrF (character
|
|
6
|
+
n-gram F-score), token-level BLEU, or exact match. Mirrors the contract of the
|
|
7
|
+
other plugins (`list_benchmarks` / `get_benchmark` / `validate` / `run`) and
|
|
8
|
+
emits the canonical signed envelope.
|
|
9
|
+
|
|
10
|
+
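Four bundled benchmarks ship out of the box:
|
|
11
|
+
|
|
12
|
+
- `llm.mt.flores-200-mini-en-fr` — FLORES-200-style English to French, chrF.
|
|
13
|
+
- `llm.mt.flores-200-mini-en-de` — FLORES-200-style English to German, chrF.
- `llm.mt.flores-200-mini-en-es` — FLORES-200-style English to Spanish, chrF.
- `llm.mt.flores-200-mini-en-ja` — FLORES-200-style English to Japanese, chrF.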
|
|
14
|
+
|
|
15
|
+
The fixtures are tiny (eight rows each, mixed across greeting / news / technical
|
|
16
|
+
/ conversational domains) — intended for skeleton verification, not headline
|
|
17
|
+
numbers.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "inferencebench-mt"
|
|
7
|
+
version = "0.0.2"
|
|
8
|
+
description = "Machine-translation plugin for InferenceBench Suite (chrF/BLEU/exact-match on bundled fixtures)"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Yobitel Communications", email = "bench@yobitel.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["benchmark", "llm", "translation", "chrf", "bleu", "ai", "ml"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 2 - Pre-Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: Apache Software License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"inferencebench-envelope",
|
|
27
|
+
"inferencebench-harness",
|
|
28
|
+
"pydantic~=2.9",
|
|
29
|
+
"pyyaml~=6.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.entry-points."inferencebench.plugins"]
|
|
33
|
+
"llm.mt" = "inferencebench_mt.plugin:LLMMTPlugin"
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/yobitelcomm/bench"
|
|
37
|
+
|
|
38
|
+
[tool.hatch.build.targets.wheel]
|
|
39
|
+
packages = ["src/inferencebench_mt"]
|
|
40
|
+
|
|
41
|
+
[tool.uv.sources]
|
|
42
|
+
inferencebench-envelope = { workspace = true }
|
|
43
|
+
inferencebench-harness = { workspace = true }
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""InferenceBench LLM machine-translation plugin."""
|
|
2
|
+
|
|
3
|
+
from inferencebench_mt.plugin import EXPECTED_METRICS, LLMMTPlugin
|
|
4
|
+
from inferencebench_mt.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"EXPECTED_METRICS",
|
|
8
|
+
"BenchmarkSpec",
|
|
9
|
+
"EngineKind",
|
|
10
|
+
"LLMMTPlugin",
|
|
11
|
+
"RunContext",
|
|
12
|
+
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-de
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to German, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-de
|
|
8
|
+
path: flores-mini-en-de.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: de
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-es
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to Spanish, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-es
|
|
8
|
+
path: flores-mini-en-es.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: es
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-fr
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to French, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-fr
|
|
8
|
+
path: flores-mini-en-fr.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: fr
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
benchmark_id: llm.mt.flores-200-mini-en-ja
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: FLORES-200-style English to Japanese, character F-score.
|
|
4
|
+
modality: llm
|
|
5
|
+
kind: translation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-flores-mini-en-ja
|
|
8
|
+
path: flores-mini-en-ja.jsonl
|
|
9
|
+
slo_template: llm.mt.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
scoring: chrf
|
|
13
|
+
source_lang: en
|
|
14
|
+
target_lang: ja
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "Hallo, wie geht es dir?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "Guten Morgen, mein Freund.", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "Der Präsident unterzeichnete gestern das neue Klimaabkommen.", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Die Aktienmärkte fielen nach der Ankündigung der Zentralbank stark.", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "Die Transformer-Architektur verwendet Self-Attention-Schichten.", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "Bitte starten Sie den Server neu, nachdem das Update abgeschlossen ist.", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "Ich hätte gerne einen Kaffee mit Milch und Zucker, bitte.", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "Wo ist der nächste Bahnhof?", "domain": "conversational"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "Hola, ¿cómo estás?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "Buenos días, mi amigo.", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "El presidente firmó ayer el nuevo acuerdo climático.", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Los mercados bursátiles cayeron bruscamente tras el anuncio del banco central.", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "La arquitectura transformer utiliza capas de autoatención.", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "Por favor, reinicia el servidor cuando termine la actualización.", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "Quisiera un café con leche y azúcar, por favor.", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "¿Dónde está la estación de tren más cercana?", "domain": "conversational"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "Bonjour, comment allez-vous ?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "Bonjour, mon ami.", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "Le président a signé le nouvel accord sur le climat hier.", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Les marchés boursiers ont fortement chuté après l'annonce de la banque centrale.", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "L'architecture transformeur utilise des couches d'auto-attention.", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "Veuillez redémarrer le serveur une fois la mise à jour terminée.", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "Je voudrais un café avec du lait et du sucre, s'il vous plaît.", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "Où se trouve la gare la plus proche ?", "domain": "conversational"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
{"source": "Hello, how are you?", "reference": "こんにちは、お元気ですか?", "domain": "greeting"}
|
|
2
|
+
{"source": "Good morning, my friend.", "reference": "おはよう、友よ。", "domain": "greeting"}
|
|
3
|
+
{"source": "The president signed the new climate agreement yesterday.", "reference": "大統領は昨日、新しい気候協定に署名しました。", "domain": "news"}
|
|
4
|
+
{"source": "Stock markets fell sharply after the central bank announcement.", "reference": "中央銀行の発表を受けて、株式市場は急落しました。", "domain": "news"}
|
|
5
|
+
{"source": "The transformer architecture uses self-attention layers.", "reference": "トランスフォーマーアーキテクチャは自己注意層を使用します。", "domain": "technical"}
|
|
6
|
+
{"source": "Please restart the server after the update completes.", "reference": "アップデートが完了したらサーバーを再起動してください。", "domain": "technical"}
|
|
7
|
+
{"source": "I would like a coffee with milk and sugar, please.", "reference": "ミルクと砂糖入りのコーヒーをお願いします。", "domain": "conversational"}
|
|
8
|
+
{"source": "Where is the nearest train station?", "reference": "最寄りの駅はどこですか?", "domain": "conversational"}
|