commonlid 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {commonlid-0.2.2 → commonlid-0.2.3}/PKG-INFO +6 -2
- {commonlid-0.2.2 → commonlid-0.2.3}/README.md +3 -1
- {commonlid-0.2.2 → commonlid-0.2.3}/pyproject.toml +9 -2
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/evaluator.py +16 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/results.py +12 -2
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/app.py +143 -19
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/data.py +125 -2
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/__init__.py +1 -0
- commonlid-0.2.3/src/commonlid/models/commonlingua.py +115 -0
- commonlid-0.2.3/src/commonlid/vendor/commonlingua/__init__.py +4 -0
- commonlid-0.2.3/src/commonlid/vendor/commonlingua/model.py +186 -0
- commonlid-0.2.3/tests/models/test_commonlingua.py +95 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_model_registration.py +1 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_leaderboard_data.py +182 -3
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_results_io.py +30 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/.gitignore +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/LICENSE +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/docs/architecture.md +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/hf-space/README.md +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/notebooks/README.md +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/_version.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/cli.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/lid_dataset.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/lid_model.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/registry.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/bibles.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/commonlid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/flores_dev.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/nano.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/smolsent.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/social_media.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/udhr.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets_tools/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets_tools/frequency_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets_tools/stratified_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/logging.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/aggregate.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/core.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/fpr.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/support_matrix.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/_fasttext_base.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/afrolid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/cld2.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/cld3.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/dspy_llm.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/fasttext_ft.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/funlangid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/glotlid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/openlidv2.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/pyfranc.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/preprocess/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/preprocess/langcodes.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/preprocess/openlid_normer.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/py.typed +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/vendor/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/vendor/fun_langid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/conftest.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/fixtures/preprocess_golden.jsonl +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/fixtures/tiny_lid.jsonl +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/fixtures/tiny_support_matrix.csv +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/hf-space/test_space_entrypoint.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_bibles_build_vs_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_cli_end_to_end.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_cli_generate_support_matrix.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_nano_build_vs_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_readme_examples.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_smoke_parity.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_smoke_parity_commonlid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_smolsent_build_vs_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/legacy/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/legacy/langid_datasets.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/legacy/langid_models.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_cld2.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_cld3.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_discover_supported_languages.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_dspy_llm.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_fasttext_base.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_funlangid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_pyfranc.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_cli_stub.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_dataset_build_from_source.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_dataset_registration.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_evaluator.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_frequency_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_langcodes.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_lid_model_base.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_metrics_aggregate.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_metrics_core.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_metrics_fpr.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_nano_datasets.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_notebook_validity.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_preprocess.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_private_dataset_error.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_registry.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_stratified_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_support_matrix.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: commonlid
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Evaluate language identification models on CommonLID and other benchmarks.
|
|
5
5
|
Project-URL: Homepage, https://huggingface.co/datasets/commoncrawl/CommonLID
|
|
6
6
|
Project-URL: Paper, https://arxiv.org/abs/2601.18026
|
|
@@ -247,6 +247,8 @@ Requires-Dist: torch>=2.4; extra == 'all'
|
|
|
247
247
|
Requires-Dist: transformers<5,>=4.46; extra == 'all'
|
|
248
248
|
Provides-Extra: cld3
|
|
249
249
|
Requires-Dist: cld3-py>=3.1; extra == 'cld3'
|
|
250
|
+
Provides-Extra: commonlingua
|
|
251
|
+
Requires-Dist: torch>=2.4; extra == 'commonlingua'
|
|
250
252
|
Provides-Extra: dev
|
|
251
253
|
Requires-Dist: azure-identity>=1.17; extra == 'dev'
|
|
252
254
|
Requires-Dist: botocore>=1.35; extra == 'dev'
|
|
@@ -315,6 +317,7 @@ From PyPI:
|
|
|
315
317
|
pip install commonlid # core deps + classical LID models
|
|
316
318
|
pip install "commonlid[llm]" # + DSPy-based LLM evaluation
|
|
317
319
|
pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
|
|
320
|
+
pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
|
|
318
321
|
pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
|
|
319
322
|
pip install "commonlid[all]" # everything runtime-facing
|
|
320
323
|
```
|
|
@@ -468,7 +471,7 @@ from commonlid import list_models, list_datasets
|
|
|
468
471
|
|
|
469
472
|
assert list_models() == [
|
|
470
473
|
"AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
|
|
471
|
-
"fasttext", "funlangid", "pyfranc",
|
|
474
|
+
"commonlingua", "fasttext", "funlangid", "pyfranc",
|
|
472
475
|
]
|
|
473
476
|
assert list_datasets() == [
|
|
474
477
|
"bibles_300", "bibles_300_nano",
|
|
@@ -574,6 +577,7 @@ for line in preds_path.read_text().splitlines():
|
|
|
574
577
|
| `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
|
|
575
578
|
| `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
|
|
576
579
|
| `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
|
|
580
|
+
| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
|
|
577
581
|
| `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
|
|
578
582
|
|
|
579
583
|
LLM models are instantiated dynamically (`DSPyLLMModel`) and not
|
|
@@ -39,6 +39,7 @@ From PyPI:
|
|
|
39
39
|
pip install commonlid # core deps + classical LID models
|
|
40
40
|
pip install "commonlid[llm]" # + DSPy-based LLM evaluation
|
|
41
41
|
pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
|
|
42
|
+
pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
|
|
42
43
|
pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
|
|
43
44
|
pip install "commonlid[all]" # everything runtime-facing
|
|
44
45
|
```
|
|
@@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets
|
|
|
192
193
|
|
|
193
194
|
assert list_models() == [
|
|
194
195
|
"AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
|
|
195
|
-
"fasttext", "funlangid", "pyfranc",
|
|
196
|
+
"commonlingua", "fasttext", "funlangid", "pyfranc",
|
|
196
197
|
]
|
|
197
198
|
assert list_datasets() == [
|
|
198
199
|
"bibles_300", "bibles_300_nano",
|
|
@@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
|
|
|
298
299
|
| `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
|
|
299
300
|
| `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
|
|
300
301
|
| `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
|
|
302
|
+
| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
|
|
301
303
|
| `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
|
|
302
304
|
|
|
303
305
|
LLM models are instantiated dynamically (`DSPyLLMModel`) and not
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "commonlid"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "Evaluate language identification models on CommonLID and other benchmarks."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -59,6 +59,11 @@ llm = [
|
|
|
59
59
|
"botocore>=1.35",
|
|
60
60
|
]
|
|
61
61
|
cld3 = ["cld3-py>=3.1"]
|
|
62
|
+
commonlingua = [
|
|
63
|
+
# CommonLingua is a 2.35M-param byte-level model; needs torch but not the
|
|
64
|
+
# transformers stack that [afrolid] pulls in.
|
|
65
|
+
"torch>=2.4",
|
|
66
|
+
]
|
|
62
67
|
leaderboard = [
|
|
63
68
|
# gradio 4.x imports HfFolder from huggingface_hub, which was removed in
|
|
64
69
|
# huggingface-hub 1.0; gradio 5 dropped that import.
|
|
@@ -88,7 +93,7 @@ notebooks = [
|
|
|
88
93
|
"nbclient>=0.10",
|
|
89
94
|
]
|
|
90
95
|
all = [
|
|
91
|
-
"commonlid[afrolid,llm]",
|
|
96
|
+
"commonlid[afrolid,llm,commonlingua]",
|
|
92
97
|
]
|
|
93
98
|
|
|
94
99
|
[project.scripts]
|
|
@@ -208,6 +213,8 @@ omit = [
|
|
|
208
213
|
# afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
|
|
209
214
|
# installed in dev and so exercised only via mocked unit tests.
|
|
210
215
|
"src/commonlid/models/afrolid.py",
|
|
216
|
+
# commonlingua needs the `[commonlingua]` extra (torch); same precedent.
|
|
217
|
+
"src/commonlid/models/commonlingua.py",
|
|
211
218
|
]
|
|
212
219
|
|
|
213
220
|
[tool.coverage.report]
|
|
@@ -159,6 +159,21 @@ class Evaluator:
|
|
|
159
159
|
)
|
|
160
160
|
n_with_gold = sum(1 for g in ytrue if g is not None)
|
|
161
161
|
samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
|
|
162
|
+
# `None` here is meaningful: it tells downstream consumers that the
|
|
163
|
+
# model's support set is undefined (e.g. LLMs), distinct from a model
|
|
164
|
+
# that declared an empty set. Errors during discovery downgrade to
|
|
165
|
+
# the same "unknown" sentinel rather than crashing the run.
|
|
166
|
+
try:
|
|
167
|
+
supported = model.discover_supported_languages()
|
|
168
|
+
except Exception as exc:
|
|
169
|
+
logger.warning(
|
|
170
|
+
"%s discover_supported_languages() raised %s: %s -- recording as None",
|
|
171
|
+
prefix,
|
|
172
|
+
type(exc).__name__,
|
|
173
|
+
exc,
|
|
174
|
+
)
|
|
175
|
+
supported = None
|
|
176
|
+
supported_languages = sorted(supported) if supported is not None else None
|
|
162
177
|
result = Result(
|
|
163
178
|
model_id=model.model_id,
|
|
164
179
|
dataset_id=dataset.dataset_id,
|
|
@@ -170,6 +185,7 @@ class Evaluator:
|
|
|
170
185
|
limit=self.config.limit,
|
|
171
186
|
timestamp=datetime.now(timezone.utc).isoformat(),
|
|
172
187
|
commonlid_version=__version__,
|
|
188
|
+
supported_languages=supported_languages,
|
|
173
189
|
)
|
|
174
190
|
|
|
175
191
|
run_dir = self.config.output_dir / dataset.dataset_id / model.model_id
|
|
@@ -13,12 +13,20 @@ from typing import Any
|
|
|
13
13
|
from commonlid.metrics.aggregate import macro_average, micro_average
|
|
14
14
|
from commonlid.metrics.core import LanguageMetrics
|
|
15
15
|
|
|
16
|
-
SCHEMA_VERSION =
|
|
16
|
+
SCHEMA_VERSION = 3
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
@dataclass(slots=True)
|
|
20
20
|
class Result:
|
|
21
|
-
"""Aggregate outcome of one model evaluated on one dataset."""
|
|
21
|
+
"""Aggregate outcome of one model evaluated on one dataset.
|
|
22
|
+
|
|
23
|
+
``supported_languages`` follows a tri-state convention shared with
|
|
24
|
+
:meth:`LIDModel.discover_supported_languages`: ``None`` means the
|
|
25
|
+
model's support set is undefined (e.g. LLM-based models that can be
|
|
26
|
+
prompted for any language), a list of ISO 639-3 codes is the closed
|
|
27
|
+
set the model declares, and an empty list is the degenerate "supports
|
|
28
|
+
zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
|
|
29
|
+
"""
|
|
22
30
|
|
|
23
31
|
model_id: str
|
|
24
32
|
dataset_id: str
|
|
@@ -32,6 +40,7 @@ class Result:
|
|
|
32
40
|
commonlid_version: str = ""
|
|
33
41
|
python_version: str = field(default_factory=lambda: sys.version.split()[0])
|
|
34
42
|
platform: str = field(default_factory=platform.platform)
|
|
43
|
+
supported_languages: list[str] | None = None
|
|
35
44
|
extra: dict[str, Any] = field(default_factory=dict)
|
|
36
45
|
|
|
37
46
|
def summary(self) -> dict[str, Any]:
|
|
@@ -52,6 +61,7 @@ class Result:
|
|
|
52
61
|
"macro": macro_average(self.per_language),
|
|
53
62
|
"micro": micro_average(self.per_language),
|
|
54
63
|
"per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
|
|
64
|
+
"supported_languages": self.supported_languages,
|
|
55
65
|
"extra": self.extra,
|
|
56
66
|
}
|
|
57
67
|
|
|
@@ -42,6 +42,20 @@ BLOG_URL = (
|
|
|
42
42
|
)
|
|
43
43
|
PAPER_URL = "https://arxiv.org/abs/2601.18026"
|
|
44
44
|
|
|
45
|
+
Scope = Literal["all", "cov"]
|
|
46
|
+
|
|
47
|
+
#: Radio choices shown above each dataset's results table.
|
|
48
|
+
SCOPE_CHOICES: list[tuple[str, Scope]] = [
|
|
49
|
+
("Scores are calculated over the whole dataset.", "all"),
|
|
50
|
+
(
|
|
51
|
+
"Scores are calculated on the subset of language varieties covered by the model. (cov.)",
|
|
52
|
+
"cov",
|
|
53
|
+
),
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
#: Sentinel string used when a row has no cov data (rendered as em-dash).
|
|
57
|
+
_NA_DISPLAY = "—"
|
|
58
|
+
|
|
45
59
|
#: Display columns in the headline table (in order). Macro F1 is the headline metric.
|
|
46
60
|
_HEADLINE_COLUMNS: list[tuple[str, str]] = [
|
|
47
61
|
("model_id", "Model"),
|
|
@@ -51,6 +65,19 @@ _HEADLINE_COLUMNS: list[tuple[str, str]] = [
|
|
|
51
65
|
("n_languages", "Languages"),
|
|
52
66
|
("samples_per_second", "Samples/s"),
|
|
53
67
|
]
|
|
68
|
+
|
|
69
|
+
#: Same columns, projected from the ``*_cov`` source fields. Display
|
|
70
|
+
#: labels stay identical so the table layout doesn't shift when the
|
|
71
|
+
#: scope radio is toggled.
|
|
72
|
+
_HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [
|
|
73
|
+
("model_id", "Model"),
|
|
74
|
+
("macro_f1_cov", "Macro F1"),
|
|
75
|
+
("micro_f1_cov", "Micro F1"),
|
|
76
|
+
("mean_fpr_cov", "Mean FPR (%)"),
|
|
77
|
+
("n_languages_cov", "Languages"),
|
|
78
|
+
("samples_per_second", "Samples/s"),
|
|
79
|
+
]
|
|
80
|
+
|
|
54
81
|
#: Right-aligned numeric columns get the ``number`` Gradio datatype which
|
|
55
82
|
#: pushes values to the right edge of the cell.
|
|
56
83
|
_GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"]
|
|
@@ -134,6 +161,46 @@ _DRILLDOWN_COLUMN_HELP: list[tuple[str, str]] = [
|
|
|
134
161
|
]
|
|
135
162
|
|
|
136
163
|
|
|
164
|
+
#: Per-column human descriptions for the **(cov.)** view — same metrics,
|
|
165
|
+
#: but restricted to the model's declared support set.
|
|
166
|
+
_HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [
|
|
167
|
+
("Model", "Identifier of the language identification model."),
|
|
168
|
+
(
|
|
169
|
+
"Macro F1",
|
|
170
|
+
"Unweighted mean of per-language F1 (x100) **restricted to languages the "
|
|
171
|
+
"model declares it supports** (paper `(cov.)` definition). Languages outside "
|
|
172
|
+
"the model's support set are excluded from the average — a model that covers "
|
|
173
|
+
"a small but accurate subset of the benchmark is no longer penalised for the "
|
|
174
|
+
"long tail of languages it never claimed to handle. **Higher is better.** "
|
|
175
|
+
f"Models without a declared support set show `{_NA_DISPLAY}`.",
|
|
176
|
+
),
|
|
177
|
+
(
|
|
178
|
+
"Micro F1",
|
|
179
|
+
"Sample-weighted F1 (x100) pooled over the **model-supported subset** of "
|
|
180
|
+
"gold samples only. **Higher is better.** "
|
|
181
|
+
f"`{_NA_DISPLAY}` when no support set is declared.",
|
|
182
|
+
),
|
|
183
|
+
(
|
|
184
|
+
"Mean FPR (%)",
|
|
185
|
+
"Mean per-language false-positive rate computed only on samples whose gold "
|
|
186
|
+
"language is in the model's support set; TN counts confusion across other "
|
|
187
|
+
"supported languages, not the long tail. **Lower is better.** "
|
|
188
|
+
f"`{_NA_DISPLAY}` when no support set is declared.",
|
|
189
|
+
),
|
|
190
|
+
(
|
|
191
|
+
"Languages",
|
|
192
|
+
"Number of model-supported languages that have at least one gold sample in "
|
|
193
|
+
"this dataset (`|supported ∩ gold|`). This is the size of the slice every "
|
|
194
|
+
"other `(cov.)` metric is averaged over.",
|
|
195
|
+
),
|
|
196
|
+
(
|
|
197
|
+
"Samples/s",
|
|
198
|
+
"Throughput during evaluation (samples processed per second). Unaffected by "
|
|
199
|
+
"the scope toggle — it is a model-property, not a metric.",
|
|
200
|
+
),
|
|
201
|
+
]
|
|
202
|
+
|
|
203
|
+
|
|
137
204
|
def _columns_help_markdown(items: list[tuple[str, str]]) -> str:
|
|
138
205
|
"""Render a (column, description) list as a Markdown bullet block."""
|
|
139
206
|
return "\n".join(f"- **{label}** — {desc}" for label, desc in items)
|
|
@@ -157,30 +224,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]:
|
|
|
157
224
|
return {"data": data, "headers": headers, "metadata": {"styling": styling}}
|
|
158
225
|
|
|
159
226
|
|
|
160
|
-
def _format_table(df: Any) -> Any:
|
|
227
|
+
def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str:
|
|
228
|
+
"""Format a numeric value with ``decimals`` precision, em-dash for ``None``/``NaN``."""
|
|
229
|
+
import pandas as pd
|
|
230
|
+
|
|
231
|
+
if value is None or (isinstance(value, float) and pd.isna(value)):
|
|
232
|
+
return _NA_DISPLAY
|
|
233
|
+
return f"{float(value) * scale:.{decimals}f}"
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _format_table(df: Any, scope: Scope = "all") -> Any:
|
|
161
237
|
"""Project + format a results DataFrame for one Gradio tab.
|
|
162
238
|
|
|
163
239
|
Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00``
|
|
164
240
|
not ``0``) so the rendered cells line up vertically; sort ordering is
|
|
165
|
-
preserved by sorting on the raw
|
|
241
|
+
preserved by sorting on the raw float *before* formatting.
|
|
166
242
|
|
|
167
243
|
- Macro F1 / Micro F1 / Samples/s use **1 decimal**.
|
|
168
244
|
- Mean FPR (%) uses **2 decimals**.
|
|
245
|
+
- In ``scope="cov"``, rows without ``supported_languages`` data render
|
|
246
|
+
em-dashes for every cov metric and sort to the bottom.
|
|
169
247
|
"""
|
|
170
248
|
import pandas as pd
|
|
171
249
|
|
|
250
|
+
columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS
|
|
251
|
+
display_labels = [label for _, label in columns]
|
|
172
252
|
if df.empty:
|
|
173
|
-
return pd.DataFrame(columns=
|
|
253
|
+
return pd.DataFrame(columns=display_labels)
|
|
254
|
+
|
|
174
255
|
out = df.copy()
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
out
|
|
180
|
-
out
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
256
|
+
source = {key: key for key, _ in columns}
|
|
257
|
+
sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
|
|
258
|
+
# ``na_position="last"`` sinks rows without cov data to the bottom of
|
|
259
|
+
# the (cov.) view; the "all" view has no NaNs in this column.
|
|
260
|
+
out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last")
|
|
261
|
+
out = out.reset_index(drop=True)
|
|
262
|
+
|
|
263
|
+
macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
|
|
264
|
+
micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"]
|
|
265
|
+
fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"]
|
|
266
|
+
langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"]
|
|
267
|
+
|
|
268
|
+
out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100))
|
|
269
|
+
out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100))
|
|
270
|
+
out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100))
|
|
271
|
+
out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0))
|
|
272
|
+
out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1))
|
|
273
|
+
|
|
274
|
+
out = out[[k for k, _ in columns]]
|
|
275
|
+
out.columns = display_labels
|
|
184
276
|
return out
|
|
185
277
|
|
|
186
278
|
|
|
@@ -314,23 +406,30 @@ def _format_license(license_name: str, license_url: str | None) -> str:
|
|
|
314
406
|
|
|
315
407
|
def _make_select_handler(
|
|
316
408
|
dataset_id: str,
|
|
317
|
-
table: Any,
|
|
318
409
|
snapshot_root: Path,
|
|
319
410
|
) -> Any:
|
|
320
411
|
"""Build the row-select callback as a closure over the captured state.
|
|
321
412
|
|
|
413
|
+
The callback looks up the clicked row in the *current* table value
|
|
414
|
+
(passed in via Gradio's event arg) so that switching the scope radio
|
|
415
|
+
and then clicking a row drills down the row at its post-toggle
|
|
416
|
+
position, not the row that would have been there before the swap.
|
|
417
|
+
|
|
322
418
|
Gradio inspects ``__defaults__`` when registering events, and comparing a
|
|
323
419
|
DataFrame default against a type annotation hits an unimplemented arrow
|
|
324
420
|
dtype path. A closure keeps the state out of the function signature.
|
|
325
421
|
"""
|
|
326
422
|
|
|
327
|
-
def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
|
|
423
|
+
def _on_select(table_value: Any, evt: gr.SelectData) -> tuple[str, Any]:
|
|
328
424
|
if evt.index is None:
|
|
329
425
|
return ("_Click a row to load per-language metrics._", None)
|
|
330
426
|
row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index
|
|
331
427
|
try:
|
|
332
|
-
|
|
333
|
-
|
|
428
|
+
data = table_value.get("data") if isinstance(table_value, dict) else None
|
|
429
|
+
if data is None:
|
|
430
|
+
return ("_Click a row to load per-language metrics._", None)
|
|
431
|
+
model_id = data[row_idx][0]
|
|
432
|
+
except (IndexError, KeyError, TypeError):
|
|
334
433
|
return ("_Could not resolve clicked row._", None)
|
|
335
434
|
per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id)
|
|
336
435
|
return (
|
|
@@ -341,6 +440,19 @@ def _make_select_handler(
|
|
|
341
440
|
return _on_select
|
|
342
441
|
|
|
343
442
|
|
|
443
|
+
def _make_scope_handler(sub_df: Any) -> Any:
|
|
444
|
+
"""Build the scope-radio change callback: swap the table data + legend in lockstep."""
|
|
445
|
+
|
|
446
|
+
def _on_change(scope: Scope) -> tuple[Any, str]:
|
|
447
|
+
help_items = _HEADLINE_COLUMN_HELP_COV if scope == "cov" else _HEADLINE_COLUMN_HELP
|
|
448
|
+
return (
|
|
449
|
+
_styled_value(_format_table(sub_df, scope=scope)),
|
|
450
|
+
_columns_help_markdown(help_items),
|
|
451
|
+
)
|
|
452
|
+
|
|
453
|
+
return _on_change
|
|
454
|
+
|
|
455
|
+
|
|
344
456
|
def build_app(
|
|
345
457
|
*,
|
|
346
458
|
repo_id: str = DEFAULT_REPO_ID,
|
|
@@ -384,7 +496,7 @@ def build_app(
|
|
|
384
496
|
with gr.Tab(label=tab_label):
|
|
385
497
|
gr.Markdown(_dataset_metadata_markdown(dataset_id))
|
|
386
498
|
sub = df[df["dataset_id"] == dataset_id]
|
|
387
|
-
table = _format_table(sub)
|
|
499
|
+
table = _format_table(sub, scope="all")
|
|
388
500
|
if table.empty:
|
|
389
501
|
gr.Markdown(
|
|
390
502
|
f"_No results for `{dataset_id}` in `{repo_id}` yet."
|
|
@@ -394,6 +506,12 @@ def build_app(
|
|
|
394
506
|
)
|
|
395
507
|
continue
|
|
396
508
|
|
|
509
|
+
scope_radio = gr.Radio(
|
|
510
|
+
choices=SCOPE_CHOICES,
|
|
511
|
+
value="all",
|
|
512
|
+
label="Scoring scope",
|
|
513
|
+
interactive=True,
|
|
514
|
+
)
|
|
397
515
|
leaderboard = gr.Dataframe(
|
|
398
516
|
value=_styled_value(table),
|
|
399
517
|
datatype=_HEADLINE_DATATYPES,
|
|
@@ -402,7 +520,7 @@ def build_app(
|
|
|
402
520
|
label=f"{dataset_id} — sorted by Macro F1",
|
|
403
521
|
)
|
|
404
522
|
with gr.Accordion("What do these columns mean?", open=False):
|
|
405
|
-
gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
|
|
523
|
+
legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
|
|
406
524
|
drilldown_label = gr.Markdown("_Click a row to load per-language metrics._")
|
|
407
525
|
# Seed the drilldown grid with an empty DataFrame so the Component
|
|
408
526
|
# has stable column headers before the first row click.
|
|
@@ -415,8 +533,14 @@ def build_app(
|
|
|
415
533
|
with gr.Accordion("What do these per-language columns mean?", open=False):
|
|
416
534
|
gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP))
|
|
417
535
|
|
|
536
|
+
scope_radio.change(
|
|
537
|
+
_make_scope_handler(sub),
|
|
538
|
+
inputs=[scope_radio],
|
|
539
|
+
outputs=[leaderboard, legend],
|
|
540
|
+
)
|
|
418
541
|
leaderboard.select(
|
|
419
|
-
_make_select_handler(dataset_id, table, snapshot_root),
|
|
542
|
+
_make_select_handler(dataset_id, snapshot_root),
|
|
543
|
+
inputs=[leaderboard],
|
|
420
544
|
outputs=[drilldown_label, drilldown],
|
|
421
545
|
)
|
|
422
546
|
gr.Markdown(footer)
|
|
@@ -14,11 +14,15 @@ from __future__ import annotations
|
|
|
14
14
|
|
|
15
15
|
import json
|
|
16
16
|
import logging
|
|
17
|
-
|
|
17
|
+
import math
|
|
18
|
+
from collections.abc import Iterable, Mapping
|
|
18
19
|
from dataclasses import asdict, dataclass
|
|
19
20
|
from pathlib import Path
|
|
20
21
|
from typing import Any
|
|
21
22
|
|
|
23
|
+
from commonlid.metrics.core import LanguageMetrics
|
|
24
|
+
from commonlid.metrics.fpr import mean_false_positive_rate, mean_stats_with_coverage
|
|
25
|
+
|
|
22
26
|
logger = logging.getLogger(__name__)
|
|
23
27
|
|
|
24
28
|
DEFAULT_REPO_ID = "commoncrawl/commonlid-results"
|
|
@@ -40,6 +44,14 @@ class LeaderboardRow:
|
|
|
40
44
|
gold set. That's a model-property number, not a paper headline, and
|
|
41
45
|
it stays consistent across rows: every model is reported on the same
|
|
42
46
|
"what languages did you actually output here" basis.
|
|
47
|
+
|
|
48
|
+
The ``*_cov`` mirror fields are the same metrics restricted to gold
|
|
49
|
+
samples whose language is in the model's declared support set
|
|
50
|
+
(``supported_languages``). They are ``None`` when no support set is
|
|
51
|
+
available — either the field is missing from ``summary.json`` (legacy
|
|
52
|
+
file), the field is JSON ``null`` (LLM-style models whose support set
|
|
53
|
+
is undefined), or the field is an empty list (degenerate "supports
|
|
54
|
+
zero languages"). All three render as em-dashes in the cov view.
|
|
43
55
|
"""
|
|
44
56
|
|
|
45
57
|
dataset_id: str
|
|
@@ -57,6 +69,13 @@ class LeaderboardRow:
|
|
|
57
69
|
commonlid_version: str
|
|
58
70
|
timestamp: str
|
|
59
71
|
is_imported: bool
|
|
72
|
+
supported_languages: list[str] | None
|
|
73
|
+
macro_f1_cov: float | None
|
|
74
|
+
macro_precision_cov: float | None
|
|
75
|
+
macro_recall_cov: float | None
|
|
76
|
+
micro_f1_cov: float | None
|
|
77
|
+
mean_fpr_cov: float | None
|
|
78
|
+
n_languages_cov: int | None
|
|
60
79
|
|
|
61
80
|
def to_dict(self) -> dict[str, Any]:
|
|
62
81
|
return asdict(self)
|
|
@@ -68,10 +87,107 @@ def _safe_mean_fpr(per_language: dict[str, dict[str, Any]]) -> float:
|
|
|
68
87
|
return sum(vals) / len(vals) if vals else 0.0
|
|
69
88
|
|
|
70
89
|
|
|
90
|
+
def _hydrate_per_language(
|
|
91
|
+
per_language: Mapping[str, Mapping[str, Any]],
|
|
92
|
+
) -> dict[str, LanguageMetrics]:
|
|
93
|
+
"""Reconstruct :class:`LanguageMetrics` objects from the serialised dict form."""
|
|
94
|
+
out: dict[str, LanguageMetrics] = {}
|
|
95
|
+
for lang, m in per_language.items():
|
|
96
|
+
out[lang] = LanguageMetrics(
|
|
97
|
+
gt_count=int(m.get("gt_count", 0)),
|
|
98
|
+
predictions=int(m.get("predictions", 0)),
|
|
99
|
+
correct=int(m.get("correct", 0)),
|
|
100
|
+
precision=float(m.get("precision", 0.0) or 0.0),
|
|
101
|
+
recall=float(m.get("recall", 0.0) or 0.0),
|
|
102
|
+
f1=float(m.get("f1", 0.0) or 0.0),
|
|
103
|
+
fpr=None if m.get("fpr") is None else float(m["fpr"]),
|
|
104
|
+
)
|
|
105
|
+
return out
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _micro_average_over(rows: Mapping[str, LanguageMetrics]) -> tuple[float, float, float]:
|
|
109
|
+
"""Pooled precision/recall/F1 over a (filtered) per-language slice.
|
|
110
|
+
|
|
111
|
+
Mirrors :func:`commonlid.metrics.aggregate.micro_average`'s
|
|
112
|
+
``*_gold_only`` math but accepts a pre-filtered subset, which the
|
|
113
|
+
public helper does not.
|
|
114
|
+
"""
|
|
115
|
+
total_correct = sum(m.correct for m in rows.values())
|
|
116
|
+
total_predictions = sum(m.predictions for m in rows.values())
|
|
117
|
+
total_gold = sum(m.gt_count for m in rows.values())
|
|
118
|
+
precision = total_correct / total_predictions if total_predictions > 0 else 0.0
|
|
119
|
+
recall = total_correct / total_gold if total_gold > 0 else 0.0
|
|
120
|
+
f1 = (
|
|
121
|
+
2 * precision * recall / (precision + recall)
|
|
122
|
+
if (precision + recall) > 0 and not math.isclose(precision + recall, 0.0)
|
|
123
|
+
else 0.0
|
|
124
|
+
)
|
|
125
|
+
return precision, recall, f1
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _compute_cov_fields(
|
|
129
|
+
per_language_raw: Mapping[str, Mapping[str, Any]],
|
|
130
|
+
supported: list[str] | None,
|
|
131
|
+
) -> dict[str, float | int | None]:
|
|
132
|
+
"""Return the six cov-variant fields, or all ``None`` when no cov data.
|
|
133
|
+
|
|
134
|
+
``supported`` semantics:
|
|
135
|
+
|
|
136
|
+
- ``None`` — model's support set is undefined (e.g. LLM); no cov data.
|
|
137
|
+
- ``[]`` — model declared zero supported languages; every cov metric
|
|
138
|
+
would divide by zero, so render as no-data.
|
|
139
|
+
- non-empty list — compute the cov metrics.
|
|
140
|
+
"""
|
|
141
|
+
none_result: dict[str, float | int | None] = {
|
|
142
|
+
"macro_f1_cov": None,
|
|
143
|
+
"macro_precision_cov": None,
|
|
144
|
+
"macro_recall_cov": None,
|
|
145
|
+
"micro_f1_cov": None,
|
|
146
|
+
"mean_fpr_cov": None,
|
|
147
|
+
"n_languages_cov": None,
|
|
148
|
+
}
|
|
149
|
+
if not supported:
|
|
150
|
+
return none_result
|
|
151
|
+
|
|
152
|
+
supported_set = set(supported)
|
|
153
|
+
per_language = _hydrate_per_language(per_language_raw)
|
|
154
|
+
stats = mean_stats_with_coverage(per_language, model_supported_languages=supported_set)
|
|
155
|
+
cov = stats["cov"]
|
|
156
|
+
n_languages_cov = int(cov.get("cov_count", 0))
|
|
157
|
+
if n_languages_cov == 0:
|
|
158
|
+
# Supported set has no overlap with the dataset's gold; nothing
|
|
159
|
+
# meaningful to report.
|
|
160
|
+
return none_result
|
|
161
|
+
|
|
162
|
+
cov_rows = {
|
|
163
|
+
lang: m for lang, m in per_language.items() if m.gt_count > 0 and lang in supported_set
|
|
164
|
+
}
|
|
165
|
+
_micro_precision, _micro_recall, micro_f1 = _micro_average_over(cov_rows)
|
|
166
|
+
mean_fpr_cov = mean_false_positive_rate(per_language, language_whitelist=supported_set)
|
|
167
|
+
|
|
168
|
+
return {
|
|
169
|
+
"macro_f1_cov": float(cov["f1"]),
|
|
170
|
+
"macro_precision_cov": float(cov["precision"]),
|
|
171
|
+
"macro_recall_cov": float(cov["recall"]),
|
|
172
|
+
"micro_f1_cov": float(micro_f1),
|
|
173
|
+
"mean_fpr_cov": float(mean_fpr_cov),
|
|
174
|
+
"n_languages_cov": n_languages_cov,
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
|
|
71
178
|
def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -> LeaderboardRow:
|
|
72
179
|
macro = summary.get("macro", {})
|
|
73
180
|
micro = summary.get("micro", {})
|
|
74
181
|
extra = summary.get("extra", {}) or {}
|
|
182
|
+
per_language = summary.get("per_language", {}) or {}
|
|
183
|
+
|
|
184
|
+
# Tri-state: missing key, JSON null, or list. Anything else (e.g. an
|
|
185
|
+
# accidentally-serialised set) collapses to "unknown".
|
|
186
|
+
raw_supported = summary.get("supported_languages")
|
|
187
|
+
supported: list[str] | None = list(raw_supported) if isinstance(raw_supported, list) else None
|
|
188
|
+
cov = _compute_cov_fields(per_language, supported)
|
|
189
|
+
|
|
190
|
+
n_languages_cov = cov["n_languages_cov"]
|
|
75
191
|
return LeaderboardRow(
|
|
76
192
|
dataset_id=dataset_id,
|
|
77
193
|
model_id=model_id,
|
|
@@ -79,7 +195,7 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
|
|
|
79
195
|
macro_precision=float(macro.get("precision_gold_only", 0.0)),
|
|
80
196
|
macro_recall=float(macro.get("recall_gold_only", 0.0)),
|
|
81
197
|
micro_f1=float(micro.get("f1_gold_only", 0.0)),
|
|
82
|
-
mean_fpr=_safe_mean_fpr(
|
|
198
|
+
mean_fpr=_safe_mean_fpr(per_language),
|
|
83
199
|
n_languages=int(macro.get("n_languages_observed", 0)),
|
|
84
200
|
n_samples=int(summary.get("n_samples", 0)),
|
|
85
201
|
n_samples_with_gold=int(summary.get("n_samples_with_gold", 0)),
|
|
@@ -88,6 +204,13 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
|
|
|
88
204
|
commonlid_version=str(summary.get("commonlid_version", "")),
|
|
89
205
|
timestamp=str(summary.get("timestamp", "")),
|
|
90
206
|
is_imported=("imported_from" in extra),
|
|
207
|
+
supported_languages=supported,
|
|
208
|
+
macro_f1_cov=cov["macro_f1_cov"],
|
|
209
|
+
macro_precision_cov=cov["macro_precision_cov"],
|
|
210
|
+
macro_recall_cov=cov["macro_recall_cov"],
|
|
211
|
+
micro_f1_cov=cov["micro_f1_cov"],
|
|
212
|
+
mean_fpr_cov=cov["mean_fpr_cov"],
|
|
213
|
+
n_languages_cov=int(n_languages_cov) if n_languages_cov is not None else None,
|
|
91
214
|
)
|
|
92
215
|
|
|
93
216
|
|
|
@@ -11,6 +11,7 @@ directly if you want to evaluate an LLM.
|
|
|
11
11
|
from commonlid.models import afrolid as _afrolid # noqa: F401
|
|
12
12
|
from commonlid.models import cld2 as _cld2 # noqa: F401
|
|
13
13
|
from commonlid.models import cld3 as _cld3 # noqa: F401
|
|
14
|
+
from commonlid.models import commonlingua as _commonlingua # noqa: F401
|
|
14
15
|
from commonlid.models import fasttext_ft as _fasttext_ft # noqa: F401
|
|
15
16
|
from commonlid.models import funlangid as _funlangid # noqa: F401
|
|
16
17
|
from commonlid.models import glotlid as _glotlid # noqa: F401
|