inferencebench-mt 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # uv / virtualenv
26
+ .venv/
27
+ venv/
28
+ env/
29
+ ENV/
30
+ uv.lock.tmp
31
+ .python-version
32
+
33
+ # Testing / coverage
34
+ .tox/
35
+ .nox/
36
+ .coverage
37
+ .coverage.*
38
+ .cache
39
+ nosetests.xml
40
+ coverage.xml
41
+ *.cover
42
+ *.py,cover
43
+ .hypothesis/
44
+ .pytest_cache/
45
+ cover/
46
+ htmlcov/
47
+
48
+ # Type checking
49
+ .mypy_cache/
50
+ .dmypy.json
51
+ dmypy.json
52
+ .pyre/
53
+ .pytype/
54
+
55
+ # Ruff
56
+ .ruff_cache/
57
+
58
+ # IDE / editor
59
+ .idea/
60
+ .vscode/
61
+ *.swp
62
+ *.swo
63
+ *~
64
+ .DS_Store
65
+
66
+ # OS
67
+ Thumbs.db
68
+ desktop.ini
69
+
70
+ # Secrets / env
71
+ .env
72
+ .env.*
73
+ !.env.example
74
+ .envrc
75
+
76
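+ # Bench-specific local caches (NOTE: gitignore does not expand `~`; the `~/...` pattern below only matches a directory literally named `~` inside the repo)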
77
+ ~/.cache/inferencebench/
78
+ .cache/inferencebench/
79
+ .inferencebench/
80
+
81
+ # Sigstore dev keys (never commit private keys)
82
+ cosign.key
83
+ cosign-*.key
84
+ cosign-*.pub
85
+ .bench/*.key
86
+ # Local benchmark working dirs (kept local; published outputs land under validation-runs/)
87
+ envelopes-voice/
88
+ envelopes-*/
89
+ *.pem
90
+ !tests/fixtures/**/*.pem
91
+
92
+ # Real-GPU validation artifacts (kept locally, never pushed)
93
+ # Use slash-star (not trailing slash) so individual subpaths can be re-included below.
94
+ validation-runs/*
95
+ # ...except the canonical published marathon corpus — small, public, used by docs + CI
96
+ !validation-runs/2026-05-18-multi-vendor-marathon
97
+ validation-runs/2026-05-18-multi-vendor-marathon/*
98
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon
99
+ validation-runs/2026-05-18-multi-vendor-marathon/marathon/*
100
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon/all
101
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/*.json
102
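+ # Voice ASR validation envelopes (small, public, used by leaderboard build). NOTE: re-including a directory without a following `<dir>/*` exclude un-ignores everything inside it, not only *.json.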
103
+ !validation-runs/2026-05-25-voice-rtx4000ada
104
+ !validation-runs/2026-05-25-voice-rtx4000ada/*.json
105
+ !validation-runs/2026-05-29-voice-testbm-h100
106
+ !validation-runs/2026-05-29-voice-testbm-h100/*.json
107
+
108
+ # Model weights / datasets (use Git LFS or S3)
109
+ *.bin
110
+ *.safetensors
111
+ *.pt
112
+ *.pth
113
+ *.gguf
114
+ *.onnx
115
+ *.parquet
116
+ !tests/fixtures/**/*.parquet
117
+
118
+ # Logs
119
+ *.log
120
+ logs/
121
+
122
+ # Documentation build
123
+ docs/_build/
124
+ site/
125
+
126
+ # Internal-only files (Claude Code context + planning) — kept locally, not pushed
127
+ /CLAUDE.md
128
+ /INDEX.md
129
+ /PROJECT_PLAN.md
130
+ /CONVENTIONS.md
131
+ /HUMAN_REVIEW_GATES.md
132
+ **/CLAUDE.md
133
+ memory/
134
+ skills/
135
+ agents/
136
+ .claude/
137
+ TICKETS/
@@ -0,0 +1,39 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferencebench-mt
3
+ Version: 0.0.2
4
+ Summary: Machine-translation plugin for InferenceBench Suite (chrF/BLEU/exact-match on bundled fixtures)
5
+ Project-URL: Homepage, https://github.com/yobitelcomm/bench
6
+ Author-email: Yobitel Communications <bench@yobitel.com>
7
+ License: Apache-2.0
8
+ Keywords: ai,benchmark,bleu,chrf,llm,ml,translation
9
+ Classifier: Development Status :: 2 - Pre-Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: inferencebench-envelope
18
+ Requires-Dist: inferencebench-harness
19
+ Requires-Dist: pydantic~=2.9
20
+ Requires-Dist: pyyaml~=6.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # inferencebench-mt
24
+
25
+ Machine-translation plugin for the InferenceBench Suite.
26
+
27
+ Scores model translations against bundled reference fixtures using chrF (character
28
+ n-gram F-score), token-level BLEU, or exact match. Mirrors the contract of the
29
+ other plugins (`list_benchmarks` / `get_benchmark` / `validate` / `run`) and
30
+ emits the canonical signed envelope.
31
+
32
+ Four bundled benchmarks ship out of the box (English to French, German, Spanish, and Japanese, all scored with chrF), including:
33
+
34
+ - `llm.mt.flores-200-mini-en-fr` — FLORES-200-style English to French, chrF.
35
+ - `llm.mt.flores-200-mini-en-de` — FLORES-200-style English to German, chrF.
36
+
37
+ The fixtures are tiny (eight rows each, mixed across greeting / news / technical
38
+ / conversational domains) — intended for skeleton verification, not headline
39
+ numbers.
@@ -0,0 +1,17 @@
1
+ # inferencebench-mt
2
+
3
+ Machine-translation plugin for the InferenceBench Suite.
4
+
5
+ Scores model translations against bundled reference fixtures using chrF (character
6
+ n-gram F-score), token-level BLEU, or exact match. Mirrors the contract of the
7
+ other plugins (`list_benchmarks` / `get_benchmark` / `validate` / `run`) and
8
+ emits the canonical signed envelope.
9
+
10
+ Four bundled benchmarks ship out of the box (English to French, German, Spanish, and Japanese, all scored with chrF), including:
11
+
12
+ - `llm.mt.flores-200-mini-en-fr` — FLORES-200-style English to French, chrF.
13
+ - `llm.mt.flores-200-mini-en-de` — FLORES-200-style English to German, chrF.
14
+
15
+ The fixtures are tiny (eight rows each, mixed across greeting / news / technical
16
+ / conversational domains) — intended for skeleton verification, not headline
17
+ numbers.
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "inferencebench-mt"
7
+ version = "0.0.2"
8
+ description = "Machine-translation plugin for InferenceBench Suite (chrF/BLEU/exact-match on bundled fixtures)"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [
13
+ { name = "Yobitel Communications", email = "bench@yobitel.com" },
14
+ ]
15
+ keywords = ["benchmark", "llm", "translation", "chrf", "bleu", "ai", "ml"]
16
+ classifiers = [
17
+ "Development Status :: 2 - Pre-Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: Apache Software License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+ dependencies = [
26
+ "inferencebench-envelope",
27
+ "inferencebench-harness",
28
+ "pydantic~=2.9",
29
+ "pyyaml~=6.0",
30
+ ]
31
+
32
+ [project.entry-points."inferencebench.plugins"]
33
+ "llm.mt" = "inferencebench_mt.plugin:LLMMTPlugin"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/yobitelcomm/bench"
37
+
38
+ [tool.hatch.build.targets.wheel]
39
+ packages = ["src/inferencebench_mt"]
40
+
41
+ [tool.uv.sources]
42
+ inferencebench-envelope = { workspace = true }
43
+ inferencebench-harness = { workspace = true }
@@ -0,0 +1,12 @@
1
+ """InferenceBench LLM machine-translation plugin."""
2
+
3
+ from inferencebench_mt.plugin import EXPECTED_METRICS, LLMMTPlugin
4
+ from inferencebench_mt.schemas import BenchmarkSpec, EngineKind, RunContext
5
+
6
+ __all__ = [
7
+ "EXPECTED_METRICS",
8
+ "BenchmarkSpec",
9
+ "EngineKind",
10
+ "LLMMTPlugin",
11
+ "RunContext",
12
+ ]
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-de
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to German, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-de
8
+ path: flores-mini-en-de.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: de
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-es
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to Spanish, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-es
8
+ path: flores-mini-en-es.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: es
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-fr
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to French, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-fr
8
+ path: flores-mini-en-fr.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: fr
@@ -0,0 +1,14 @@
1
+ benchmark_id: llm.mt.flores-200-mini-en-ja
2
+ suite_version: 1.0.0
3
+ description: FLORES-200-style English to Japanese, character F-score.
4
+ modality: llm
5
+ kind: translation
6
+ dataset:
7
+ id: builtin-flores-mini-en-ja
8
+ path: flores-mini-en-ja.jsonl
9
+ slo_template: llm.mt.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ scoring: chrf
13
+ source_lang: en
14
+ target_lang: ja
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "Hallo, wie geht es dir?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "Guten Morgen, mein Freund.", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "Der Präsident unterzeichnete gestern das neue Klimaabkommen.", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Die Aktienmärkte fielen nach der Ankündigung der Zentralbank stark.", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "Die Transformer-Architektur verwendet Self-Attention-Schichten.", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "Bitte starten Sie den Server neu, nachdem das Update abgeschlossen ist.", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "Ich hätte gerne einen Kaffee mit Milch und Zucker, bitte.", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "Wo ist der nächste Bahnhof?", "domain": "conversational"}
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "Hola, ¿cómo estás?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "Buenos días, mi amigo.", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "El presidente firmó ayer el nuevo acuerdo climático.", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Los mercados bursátiles cayeron bruscamente tras el anuncio del banco central.", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "La arquitectura transformer utiliza capas de autoatención.", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "Por favor, reinicia el servidor cuando termine la actualización.", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "Quisiera un café con leche y azúcar, por favor.", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "¿Dónde está la estación de tren más cercana?", "domain": "conversational"}
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "Bonjour, comment allez-vous ?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "Bonjour, mon ami.", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "Le président a signé le nouvel accord sur le climat hier.", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "Les marchés boursiers ont fortement chuté après l'annonce de la banque centrale.", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "L'architecture transformeur utilise des couches d'auto-attention.", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "Veuillez redémarrer le serveur une fois la mise à jour terminée.", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "Je voudrais un café avec du lait et du sucre, s'il vous plaît.", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "Où se trouve la gare la plus proche ?", "domain": "conversational"}
@@ -0,0 +1,8 @@
1
+ {"source": "Hello, how are you?", "reference": "こんにちは、お元気ですか?", "domain": "greeting"}
2
+ {"source": "Good morning, my friend.", "reference": "おはよう、友よ。", "domain": "greeting"}
3
+ {"source": "The president signed the new climate agreement yesterday.", "reference": "大統領は昨日、新しい気候協定に署名しました。", "domain": "news"}
4
+ {"source": "Stock markets fell sharply after the central bank announcement.", "reference": "中央銀行の発表を受けて、株式市場は急落しました。", "domain": "news"}
5
+ {"source": "The transformer architecture uses self-attention layers.", "reference": "トランスフォーマーアーキテクチャは自己注意層を使用します。", "domain": "technical"}
6
+ {"source": "Please restart the server after the update completes.", "reference": "アップデートが完了したらサーバーを再起動してください。", "domain": "technical"}
7
+ {"source": "I would like a coffee with milk and sugar, please.", "reference": "ミルクと砂糖入りのコーヒーをお願いします。", "domain": "conversational"}
8
+ {"source": "Where is the nearest train station?", "reference": "最寄りの駅はどこですか?", "domain": "conversational"}