commonlid 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {commonlid-0.2.2 → commonlid-0.2.4}/PKG-INFO +8 -2
- {commonlid-0.2.2 → commonlid-0.2.4}/README.md +5 -1
- commonlid-0.2.4/docs/contributing/adding_a_model.md +84 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/pyproject.toml +9 -2
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/evaluator.py +16 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/results.py +12 -2
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/leaderboard/app.py +149 -21
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/leaderboard/data.py +125 -2
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/__init__.py +1 -0
- commonlid-0.2.4/src/commonlid/models/commonlingua.py +115 -0
- commonlid-0.2.4/src/commonlid/vendor/commonlingua/__init__.py +4 -0
- commonlid-0.2.4/src/commonlid/vendor/commonlingua/model.py +186 -0
- commonlid-0.2.4/tests/models/test_commonlingua.py +95 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_model_registration.py +1 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_leaderboard_data.py +227 -3
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_results_io.py +30 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/.gitignore +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/LICENSE +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/docs/architecture.md +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/hf-space/README.md +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/notebooks/README.md +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/_version.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/cli.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/lid_dataset.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/lid_model.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/registry.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/bibles.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/commonlid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/flores_dev.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/nano.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/smolsent.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/social_media.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/udhr.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets_tools/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets_tools/frequency_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets_tools/stratified_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/leaderboard/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/logging.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/aggregate.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/core.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/fpr.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/support_matrix.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/_fasttext_base.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/afrolid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/cld2.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/cld3.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/dspy_llm.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/fasttext_ft.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/funlangid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/glotlid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/openlidv2.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/pyfranc.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/preprocess/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/preprocess/langcodes.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/preprocess/openlid_normer.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/py.typed +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/vendor/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/vendor/fun_langid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/conftest.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/fixtures/preprocess_golden.jsonl +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/fixtures/tiny_lid.jsonl +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/fixtures/tiny_support_matrix.csv +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/hf-space/test_space_entrypoint.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_bibles_build_vs_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_cli_end_to_end.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_cli_generate_support_matrix.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_nano_build_vs_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_readme_examples.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_smoke_parity.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_smoke_parity_commonlid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_smolsent_build_vs_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/legacy/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/legacy/langid_datasets.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/legacy/langid_models.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_cld2.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_cld3.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_discover_supported_languages.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_dspy_llm.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_fasttext_base.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_funlangid.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_pyfranc.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/__init__.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_cache.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_cli_stub.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_dataset_build_from_source.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_dataset_registration.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_evaluator.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_frequency_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_langcodes.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_lid_model_base.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_metrics_aggregate.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_metrics_core.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_metrics_fpr.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_nano_datasets.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_notebook_validity.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_preprocess.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_private_dataset_error.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_registry.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_stratified_sample.py +0 -0
- {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_support_matrix.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: commonlid
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Evaluate language identification models on CommonLID and other benchmarks.
|
|
5
5
|
Project-URL: Homepage, https://huggingface.co/datasets/commoncrawl/CommonLID
|
|
6
6
|
Project-URL: Paper, https://arxiv.org/abs/2601.18026
|
|
@@ -247,6 +247,8 @@ Requires-Dist: torch>=2.4; extra == 'all'
|
|
|
247
247
|
Requires-Dist: transformers<5,>=4.46; extra == 'all'
|
|
248
248
|
Provides-Extra: cld3
|
|
249
249
|
Requires-Dist: cld3-py>=3.1; extra == 'cld3'
|
|
250
|
+
Provides-Extra: commonlingua
|
|
251
|
+
Requires-Dist: torch>=2.4; extra == 'commonlingua'
|
|
250
252
|
Provides-Extra: dev
|
|
251
253
|
Requires-Dist: azure-identity>=1.17; extra == 'dev'
|
|
252
254
|
Requires-Dist: botocore>=1.35; extra == 'dev'
|
|
@@ -315,6 +317,7 @@ From PyPI:
|
|
|
315
317
|
pip install commonlid # core deps + classical LID models
|
|
316
318
|
pip install "commonlid[llm]" # + DSPy-based LLM evaluation
|
|
317
319
|
pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
|
|
320
|
+
pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
|
|
318
321
|
pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
|
|
319
322
|
pip install "commonlid[all]" # everything runtime-facing
|
|
320
323
|
```
|
|
@@ -468,7 +471,7 @@ from commonlid import list_models, list_datasets
|
|
|
468
471
|
|
|
469
472
|
assert list_models() == [
|
|
470
473
|
"AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
|
|
471
|
-
"fasttext", "funlangid", "pyfranc",
|
|
474
|
+
"commonlingua", "fasttext", "funlangid", "pyfranc",
|
|
472
475
|
]
|
|
473
476
|
assert list_datasets() == [
|
|
474
477
|
"bibles_300", "bibles_300_nano",
|
|
@@ -574,6 +577,7 @@ for line in preds_path.read_text().splitlines():
|
|
|
574
577
|
| `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
|
|
575
578
|
| `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
|
|
576
579
|
| `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
|
|
580
|
+
| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
|
|
577
581
|
| `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
|
|
578
582
|
|
|
579
583
|
LLM models are instantiated dynamically (`DSPyLLMModel`) and not
|
|
@@ -704,6 +708,8 @@ exclude the `und` bucket by default (toggle with `include_und=True`).
|
|
|
704
708
|
|
|
705
709
|
## Adding a new model
|
|
706
710
|
|
|
711
|
+
A guide for adding a new model can be found [here](docs/contributing/adding_a_model.md).
|
|
712
|
+
|
|
707
713
|
<!-- readme-test: fast; id=add-model (registers into an isolated registry) -->
|
|
708
714
|
```python
|
|
709
715
|
# src/commonlid/models/my_model.py
|
|
@@ -39,6 +39,7 @@ From PyPI:
|
|
|
39
39
|
pip install commonlid # core deps + classical LID models
|
|
40
40
|
pip install "commonlid[llm]" # + DSPy-based LLM evaluation
|
|
41
41
|
pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
|
|
42
|
+
pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
|
|
42
43
|
pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
|
|
43
44
|
pip install "commonlid[all]" # everything runtime-facing
|
|
44
45
|
```
|
|
@@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets
|
|
|
192
193
|
|
|
193
194
|
assert list_models() == [
|
|
194
195
|
"AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
|
|
195
|
-
"fasttext", "funlangid", "pyfranc",
|
|
196
|
+
"commonlingua", "fasttext", "funlangid", "pyfranc",
|
|
196
197
|
]
|
|
197
198
|
assert list_datasets() == [
|
|
198
199
|
"bibles_300", "bibles_300_nano",
|
|
@@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
|
|
|
298
299
|
| `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
|
|
299
300
|
| `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
|
|
300
301
|
| `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
|
|
302
|
+
| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
|
|
301
303
|
| `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
|
|
302
304
|
|
|
303
305
|
LLM models are instantiated dynamically (`DSPyLLMModel`) and not
|
|
@@ -428,6 +430,8 @@ exclude the `und` bucket by default (toggle with `include_und=True`).
|
|
|
428
430
|
|
|
429
431
|
## Adding a new model
|
|
430
432
|
|
|
433
|
+
A guide for adding a new model can be found [here](docs/contributing/adding_a_model.md).
|
|
434
|
+
|
|
431
435
|
<!-- readme-test: fast; id=add-model (registers into an isolated registry) -->
|
|
432
436
|
```python
|
|
433
437
|
# src/commonlid/models/my_model.py
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Adding a model to the leaderboard
|
|
2
|
+
|
|
3
|
+
The CommonLID leaderboard is available [here](https://huggingface.co/spaces/commoncrawl/commonlid).
|
|
4
|
+
|
|
5
|
+
1. Add the [model implementation](#adding-a-model-implementation) to `commonlid`
|
|
6
|
+
2. [Evaluate](#evaluate-new-model) the desired model using `commonlid` on the benchmarks
|
|
7
|
+
3. Push the results to the [results repository](https://huggingface.co/datasets/commoncrawl/commonlid-results) via a PR. Once merged they will appear on the leaderboard.
|
|
8
|
+
|
|
9
|
+
## Requesting an evaluation
|
|
10
|
+
|
|
11
|
+
If you want a model to be evaluated but are not submitting the results yourself, open an issue instead and provide the required information.
|
|
12
|
+
|
|
13
|
+
## Adding a model implementation
|
|
14
|
+
|
|
15
|
+
Adding a model implementation to `commonlid` is quite straightforward. Typically, it only requires that you provide the text-to-language prediction method and add it to the [model directory](https://github.com/commoncrawl/commonlid-eval/tree/main/src/commonlid/models):
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
# src/commonlid/models/my_model.py
|
|
19
|
+
from collections.abc import Sequence
|
|
20
|
+
|
|
21
|
+
from commonlid.core.lid_model import LIDModel
|
|
22
|
+
from commonlid.core.registry import get_model, register_model
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@register_model
|
|
26
|
+
class MyModel(LIDModel):
|
|
27
|
+
model_id = "my_model"
|
|
28
|
+
|
|
29
|
+
def _predict_batch(self, texts: Sequence[str]) -> list[str | None]:
|
|
30
|
+
# Return one ISO 639-3 code (or None for undetermined) per input.
|
|
31
|
+
# `texts` arrives post-OpenLID-normer cleaning by default;
|
|
32
|
+
# set `requires_preprocessing = False` to receive raw text.
|
|
33
|
+
return ["eng"] * len(texts)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
assert get_model("my_model").predict(["hi"]) == ["eng"]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Then import it from `src/commonlid/models/__init__.py` so the decorator
|
|
40
|
+
fires on `import commonlid`:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from commonlid.models import my_model as _my_model # noqa: F401
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
### Adding model dependencies
|
|
48
|
+
|
|
49
|
+
If you are adding a model that requires additional dependencies, you can add them to the `pyproject.toml` file, under optional dependencies:
|
|
50
|
+
|
|
51
|
+
```toml
|
|
52
|
+
cld3 = ["cld3-py>=3.1"]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Declaring a version constraint helps keep the implementation from breaking when the package is updated.
|
|
56
|
+
|
|
57
|
+
Because it is an optional dependency, you can't import it at the top level of the module; instead, import it inside the wrapper's scope (for example, inside the method that uses it).
|
|
58
|
+
|
|
59
|
+
## Evaluate new model
|
|
60
|
+
|
|
61
|
+
As soon as the model implementation is registered, you can run this command to evaluate your model on CommonLID and its nano version:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
commonlid run \
|
|
65
|
+
--model my_model \
|
|
66
|
+
--dataset commonlid --dataset commonlid_nano \
|
|
67
|
+
--output-dir ./data/results
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
You may need to reinstall the `commonlid` package with your changes if the package was not installed in editable mode.
|
|
71
|
+
|
|
72
|
+
## Uploading the results data (PR-based)
|
|
73
|
+
|
|
74
|
+
After running the evaluation locally, you can upload the results to our [HF results repository](https://huggingface.co/datasets/commoncrawl/commonlid-results) as follows:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
hf auth login # token with write access to the results dataset
|
|
78
|
+
make leaderboard-upload # opens a Pull Request from ./data/results
|
|
79
|
+
# Override the target with: make leaderboard-upload LEADERBOARD_REPO=other/repo LEADERBOARD_DIR=./elsewhere
|
|
80
|
+
# Optional: pass --skip-predictions via `uv run commonlid leaderboard upload ...` directly.
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
The CLI always opens a Pull Request rather than pushing to the default
|
|
84
|
+
branch, so the dataset owner reviews before merging.
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "commonlid"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.4"
|
|
8
8
|
description = "Evaluate language identification models on CommonLID and other benchmarks."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -59,6 +59,11 @@ llm = [
|
|
|
59
59
|
"botocore>=1.35",
|
|
60
60
|
]
|
|
61
61
|
cld3 = ["cld3-py>=3.1"]
|
|
62
|
+
commonlingua = [
|
|
63
|
+
# CommonLingua is a 2.35M-param byte-level model; needs torch but not the
|
|
64
|
+
# transformers stack that [afrolid] pulls in.
|
|
65
|
+
"torch>=2.4",
|
|
66
|
+
]
|
|
62
67
|
leaderboard = [
|
|
63
68
|
# gradio 4.x imports HfFolder from huggingface_hub, which was removed in
|
|
64
69
|
# huggingface-hub 1.0; gradio 5 dropped that import.
|
|
@@ -88,7 +93,7 @@ notebooks = [
|
|
|
88
93
|
"nbclient>=0.10",
|
|
89
94
|
]
|
|
90
95
|
all = [
|
|
91
|
-
"commonlid[afrolid,llm]",
|
|
96
|
+
"commonlid[afrolid,llm,commonlingua]",
|
|
92
97
|
]
|
|
93
98
|
|
|
94
99
|
[project.scripts]
|
|
@@ -208,6 +213,8 @@ omit = [
|
|
|
208
213
|
# afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
|
|
209
214
|
# installed in dev and so exercised only via mocked unit tests.
|
|
210
215
|
"src/commonlid/models/afrolid.py",
|
|
216
|
+
# commonlingua needs the `[commonlingua]` extra (torch); same precedent.
|
|
217
|
+
"src/commonlid/models/commonlingua.py",
|
|
211
218
|
]
|
|
212
219
|
|
|
213
220
|
[tool.coverage.report]
|
|
@@ -159,6 +159,21 @@ class Evaluator:
|
|
|
159
159
|
)
|
|
160
160
|
n_with_gold = sum(1 for g in ytrue if g is not None)
|
|
161
161
|
samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
|
|
162
|
+
# `None` here is meaningful: it tells downstream consumers that the
|
|
163
|
+
# model's support set is undefined (e.g. LLMs), distinct from a model
|
|
164
|
+
# that declared an empty set. Errors during discovery downgrade to
|
|
165
|
+
# the same "unknown" sentinel rather than crashing the run.
|
|
166
|
+
try:
|
|
167
|
+
supported = model.discover_supported_languages()
|
|
168
|
+
except Exception as exc:
|
|
169
|
+
logger.warning(
|
|
170
|
+
"%s discover_supported_languages() raised %s: %s -- recording as None",
|
|
171
|
+
prefix,
|
|
172
|
+
type(exc).__name__,
|
|
173
|
+
exc,
|
|
174
|
+
)
|
|
175
|
+
supported = None
|
|
176
|
+
supported_languages = sorted(supported) if supported is not None else None
|
|
162
177
|
result = Result(
|
|
163
178
|
model_id=model.model_id,
|
|
164
179
|
dataset_id=dataset.dataset_id,
|
|
@@ -170,6 +185,7 @@ class Evaluator:
|
|
|
170
185
|
limit=self.config.limit,
|
|
171
186
|
timestamp=datetime.now(timezone.utc).isoformat(),
|
|
172
187
|
commonlid_version=__version__,
|
|
188
|
+
supported_languages=supported_languages,
|
|
173
189
|
)
|
|
174
190
|
|
|
175
191
|
run_dir = self.config.output_dir / dataset.dataset_id / model.model_id
|
|
@@ -13,12 +13,20 @@ from typing import Any
|
|
|
13
13
|
from commonlid.metrics.aggregate import macro_average, micro_average
|
|
14
14
|
from commonlid.metrics.core import LanguageMetrics
|
|
15
15
|
|
|
16
|
-
SCHEMA_VERSION =
|
|
16
|
+
SCHEMA_VERSION = 3
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
@dataclass(slots=True)
|
|
20
20
|
class Result:
|
|
21
|
-
"""Aggregate outcome of one model evaluated on one dataset.
|
|
21
|
+
"""Aggregate outcome of one model evaluated on one dataset.
|
|
22
|
+
|
|
23
|
+
``supported_languages`` follows a tri-state convention shared with
|
|
24
|
+
:meth:`LIDModel.discover_supported_languages`: ``None`` means the
|
|
25
|
+
model's support set is undefined (e.g. LLM-based models that can be
|
|
26
|
+
prompted for any language), a list of ISO 639-3 codes is the closed
|
|
27
|
+
set the model declares, and an empty list is the degenerate "supports
|
|
28
|
+
zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
|
|
29
|
+
"""
|
|
22
30
|
|
|
23
31
|
model_id: str
|
|
24
32
|
dataset_id: str
|
|
@@ -32,6 +40,7 @@ class Result:
|
|
|
32
40
|
commonlid_version: str = ""
|
|
33
41
|
python_version: str = field(default_factory=lambda: sys.version.split()[0])
|
|
34
42
|
platform: str = field(default_factory=platform.platform)
|
|
43
|
+
supported_languages: list[str] | None = None
|
|
35
44
|
extra: dict[str, Any] = field(default_factory=dict)
|
|
36
45
|
|
|
37
46
|
def summary(self) -> dict[str, Any]:
|
|
@@ -52,6 +61,7 @@ class Result:
|
|
|
52
61
|
"macro": macro_average(self.per_language),
|
|
53
62
|
"micro": micro_average(self.per_language),
|
|
54
63
|
"per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
|
|
64
|
+
"supported_languages": self.supported_languages,
|
|
55
65
|
"extra": self.extra,
|
|
56
66
|
}
|
|
57
67
|
|
|
@@ -42,6 +42,26 @@ BLOG_URL = (
|
|
|
42
42
|
)
|
|
43
43
|
PAPER_URL = "https://arxiv.org/abs/2601.18026"
|
|
44
44
|
|
|
45
|
+
WEBSITE_URL = "https://commonlid.org/"
|
|
46
|
+
|
|
47
|
+
NEW_MODEL_URL = (
|
|
48
|
+
"https://github.com/commoncrawl/commonlid-eval/blob/main/docs/contributing/adding_a_model.md"
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
Scope = Literal["all", "cov"]
|
|
52
|
+
|
|
53
|
+
#: Radio choices shown above each dataset's results table.
|
|
54
|
+
SCOPE_CHOICES: list[tuple[str, Scope]] = [
|
|
55
|
+
("Scores are calculated over the whole dataset.", "all"),
|
|
56
|
+
(
|
|
57
|
+
"Scores are calculated on the subset of language varieties covered by the model. (cov.)",
|
|
58
|
+
"cov",
|
|
59
|
+
),
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
#: Sentinel string used when a row has no cov data (rendered as em-dash).
|
|
63
|
+
_NA_DISPLAY = "—"
|
|
64
|
+
|
|
45
65
|
#: Display columns in the headline table (in order). Macro F1 is the headline metric.
|
|
46
66
|
_HEADLINE_COLUMNS: list[tuple[str, str]] = [
|
|
47
67
|
("model_id", "Model"),
|
|
@@ -51,6 +71,19 @@ _HEADLINE_COLUMNS: list[tuple[str, str]] = [
|
|
|
51
71
|
("n_languages", "Languages"),
|
|
52
72
|
("samples_per_second", "Samples/s"),
|
|
53
73
|
]
|
|
74
|
+
|
|
75
|
+
#: Same columns, projected from the ``*_cov`` source fields. Display
|
|
76
|
+
#: labels stay identical so the table layout doesn't shift when the
|
|
77
|
+
#: scope radio is toggled.
|
|
78
|
+
_HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [
|
|
79
|
+
("model_id", "Model"),
|
|
80
|
+
("macro_f1_cov", "Macro F1"),
|
|
81
|
+
("micro_f1_cov", "Micro F1"),
|
|
82
|
+
("mean_fpr_cov", "Mean FPR (%)"),
|
|
83
|
+
("n_languages_cov", "Languages"),
|
|
84
|
+
("samples_per_second", "Samples/s"),
|
|
85
|
+
]
|
|
86
|
+
|
|
54
87
|
#: Right-aligned numeric columns get the ``number`` Gradio datatype which
|
|
55
88
|
#: pushes values to the right edge of the cell.
|
|
56
89
|
_GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"]
|
|
@@ -134,6 +167,46 @@ _DRILLDOWN_COLUMN_HELP: list[tuple[str, str]] = [
|
|
|
134
167
|
]
|
|
135
168
|
|
|
136
169
|
|
|
170
|
+
#: Per-column human descriptions for the **(cov.)** view — same metrics,
|
|
171
|
+
#: but restricted to the model's declared support set.
|
|
172
|
+
_HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [
|
|
173
|
+
("Model", "Identifier of the language identification model."),
|
|
174
|
+
(
|
|
175
|
+
"Macro F1",
|
|
176
|
+
"Unweighted mean of per-language F1 (x100) **restricted to languages the "
|
|
177
|
+
"model declares it supports** (paper `(cov.)` definition). Languages outside "
|
|
178
|
+
"the model's support set are excluded from the average — a model that covers "
|
|
179
|
+
"a small but accurate subset of the benchmark is no longer penalised for the "
|
|
180
|
+
"long tail of languages it never claimed to handle. **Higher is better.** "
|
|
181
|
+
f"Models without a declared support set show `{_NA_DISPLAY}`.",
|
|
182
|
+
),
|
|
183
|
+
(
|
|
184
|
+
"Micro F1",
|
|
185
|
+
"Sample-weighted F1 (x100) pooled over the **model-supported subset** of "
|
|
186
|
+
"gold samples only. **Higher is better.** "
|
|
187
|
+
f"`{_NA_DISPLAY}` when no support set is declared.",
|
|
188
|
+
),
|
|
189
|
+
(
|
|
190
|
+
"Mean FPR (%)",
|
|
191
|
+
"Mean per-language false-positive rate computed only on samples whose gold "
|
|
192
|
+
"language is in the model's support set; TN counts confusion across other "
|
|
193
|
+
"supported languages, not the long tail. **Lower is better.** "
|
|
194
|
+
f"`{_NA_DISPLAY}` when no support set is declared.",
|
|
195
|
+
),
|
|
196
|
+
(
|
|
197
|
+
"Languages",
|
|
198
|
+
"Number of model-supported languages that have at least one gold sample in "
|
|
199
|
+
"this dataset (`|supported ∩ gold|`). This is the size of the slice every "
|
|
200
|
+
"other `(cov.)` metric is averaged over.",
|
|
201
|
+
),
|
|
202
|
+
(
|
|
203
|
+
"Samples/s",
|
|
204
|
+
"Throughput during evaluation (samples processed per second). Unaffected by "
|
|
205
|
+
"the scope toggle — it is a model-property, not a metric.",
|
|
206
|
+
),
|
|
207
|
+
]
|
|
208
|
+
|
|
209
|
+
|
|
137
210
|
def _columns_help_markdown(items: list[tuple[str, str]]) -> str:
|
|
138
211
|
"""Render a (column, description) list as a Markdown bullet block."""
|
|
139
212
|
return "\n".join(f"- **{label}** — {desc}" for label, desc in items)
|
|
@@ -157,30 +230,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]:
|
|
|
157
230
|
return {"data": data, "headers": headers, "metadata": {"styling": styling}}
|
|
158
231
|
|
|
159
232
|
|
|
160
|
-
def
|
|
233
|
+
def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str:
|
|
234
|
+
"""Format a numeric value with ``decimals`` precision, em-dash for ``None``/``NaN``."""
|
|
235
|
+
import pandas as pd
|
|
236
|
+
|
|
237
|
+
if value is None or (isinstance(value, float) and pd.isna(value)):
|
|
238
|
+
return _NA_DISPLAY
|
|
239
|
+
return f"{float(value) * scale:.{decimals}f}"
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _format_table(df: Any, scope: Scope = "all") -> Any:
|
|
161
243
|
"""Project + format a results DataFrame for one Gradio tab.
|
|
162
244
|
|
|
163
245
|
Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00``
|
|
164
246
|
not ``0``) so the rendered cells line up vertically; sort ordering is
|
|
165
|
-
preserved by sorting on the raw
|
|
247
|
+
preserved by sorting on the raw float *before* formatting.
|
|
166
248
|
|
|
167
249
|
- Macro F1 / Micro F1 / Samples/s use **1 decimal**.
|
|
168
250
|
- Mean FPR (%) uses **2 decimals**.
|
|
251
|
+
- In ``scope="cov"``, rows without ``supported_languages`` data render
|
|
252
|
+
em-dashes for every cov metric and sort to the bottom.
|
|
169
253
|
"""
|
|
170
254
|
import pandas as pd
|
|
171
255
|
|
|
256
|
+
columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS
|
|
257
|
+
display_labels = [label for _, label in columns]
|
|
172
258
|
if df.empty:
|
|
173
|
-
return pd.DataFrame(columns=
|
|
259
|
+
return pd.DataFrame(columns=display_labels)
|
|
260
|
+
|
|
174
261
|
out = df.copy()
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
out
|
|
180
|
-
out
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
262
|
+
source = {key: key for key, _ in columns}
|
|
263
|
+
sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
|
|
264
|
+
# ``na_position="last"`` sinks rows without cov data to the bottom of
|
|
265
|
+
# the (cov.) view; the "all" view has no NaNs in this column.
|
|
266
|
+
out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last")
|
|
267
|
+
out = out.reset_index(drop=True)
|
|
268
|
+
|
|
269
|
+
macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
|
|
270
|
+
micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"]
|
|
271
|
+
fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"]
|
|
272
|
+
langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"]
|
|
273
|
+
|
|
274
|
+
out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100))
|
|
275
|
+
out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100))
|
|
276
|
+
out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100))
|
|
277
|
+
out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0))
|
|
278
|
+
out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1))
|
|
279
|
+
|
|
280
|
+
out = out[[k for k, _ in columns]]
|
|
281
|
+
out.columns = display_labels
|
|
184
282
|
return out
|
|
185
283
|
|
|
186
284
|
|
|
@@ -314,23 +412,29 @@ def _format_license(license_name: str, license_url: str | None) -> str:
|
|
|
314
412
|
|
|
315
413
|
def _make_select_handler(
|
|
316
414
|
dataset_id: str,
|
|
317
|
-
table: Any,
|
|
318
415
|
snapshot_root: Path,
|
|
319
416
|
) -> Any:
|
|
320
417
|
"""Build the row-select callback as a closure over the captured state.
|
|
321
418
|
|
|
419
|
+
Uses ``gr.SelectData.row_value`` (Gradio's per-click payload that
|
|
420
|
+
contains the clicked row as a 1-D list) so the drilldown picks up the
|
|
421
|
+
*current* table ordering — switching the scope radio and then clicking
|
|
422
|
+
a row resolves to the row at its post-toggle position. Passing the
|
|
423
|
+
Dataframe component as an event input would not work: Gradio 6
|
|
424
|
+
preprocesses Dataframe inputs into ``pandas.DataFrame`` objects, not
|
|
425
|
+
the ``{"data", "headers"}`` dict we feed in via ``_styled_value``.
|
|
426
|
+
|
|
322
427
|
Gradio inspects ``__defaults__`` when registering events, and comparing a
|
|
323
428
|
DataFrame default against a type annotation hits an unimplemented arrow
|
|
324
429
|
dtype path. A closure keeps the state out of the function signature.
|
|
325
430
|
"""
|
|
326
431
|
|
|
327
432
|
def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
|
|
328
|
-
if evt.index is None:
|
|
433
|
+
if evt.index is None or not evt.row_value:
|
|
329
434
|
return ("_Click a row to load per-language metrics._", None)
|
|
330
|
-
row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index
|
|
331
435
|
try:
|
|
332
|
-
model_id =
|
|
333
|
-
except (IndexError,
|
|
436
|
+
model_id = evt.row_value[0]
|
|
437
|
+
except (IndexError, TypeError):
|
|
334
438
|
return ("_Could not resolve clicked row._", None)
|
|
335
439
|
per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id)
|
|
336
440
|
return (
|
|
@@ -341,6 +445,19 @@ def _make_select_handler(
|
|
|
341
445
|
return _on_select
|
|
342
446
|
|
|
343
447
|
|
|
448
|
+
def _make_scope_handler(sub_df: Any) -> Any:
|
|
449
|
+
"""Build the scope-radio change callback: swap the table data + legend in lockstep."""
|
|
450
|
+
|
|
451
|
+
def _on_change(scope: Scope) -> tuple[Any, str]:
|
|
452
|
+
help_items = _HEADLINE_COLUMN_HELP_COV if scope == "cov" else _HEADLINE_COLUMN_HELP
|
|
453
|
+
return (
|
|
454
|
+
_styled_value(_format_table(sub_df, scope=scope)),
|
|
455
|
+
_columns_help_markdown(help_items),
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
return _on_change
|
|
459
|
+
|
|
460
|
+
|
|
344
461
|
def build_app(
|
|
345
462
|
*,
|
|
346
463
|
repo_id: str = DEFAULT_REPO_ID,
|
|
@@ -369,7 +486,7 @@ def build_app(
|
|
|
369
486
|
f"Headline metric: **macro F1**. Models are ranked by macro F1 "
|
|
370
487
|
f"within each tab; click a row to see per-language metrics.\n"
|
|
371
488
|
f"\n"
|
|
372
|
-
f"📝 [Blog post]({BLOG_URL}) • 📄 [Paper]({PAPER_URL})"
|
|
489
|
+
f"🌐 [Website]({WEBSITE_URL}) • 📝 [Blog post]({BLOG_URL}) • 📄 [Paper]({PAPER_URL}) • 🆕 [Add a model]({NEW_MODEL_URL})"
|
|
373
490
|
)
|
|
374
491
|
repo_url = f"https://huggingface.co/datasets/{repo_id}"
|
|
375
492
|
if revision:
|
|
@@ -384,7 +501,7 @@ def build_app(
|
|
|
384
501
|
with gr.Tab(label=tab_label):
|
|
385
502
|
gr.Markdown(_dataset_metadata_markdown(dataset_id))
|
|
386
503
|
sub = df[df["dataset_id"] == dataset_id]
|
|
387
|
-
table = _format_table(sub)
|
|
504
|
+
table = _format_table(sub, scope="all")
|
|
388
505
|
if table.empty:
|
|
389
506
|
gr.Markdown(
|
|
390
507
|
f"_No results for `{dataset_id}` in `{repo_id}` yet."
|
|
@@ -394,6 +511,12 @@ def build_app(
|
|
|
394
511
|
)
|
|
395
512
|
continue
|
|
396
513
|
|
|
514
|
+
scope_radio = gr.Radio(
|
|
515
|
+
choices=SCOPE_CHOICES,
|
|
516
|
+
value="all",
|
|
517
|
+
label="Scoring scope",
|
|
518
|
+
interactive=True,
|
|
519
|
+
)
|
|
397
520
|
leaderboard = gr.Dataframe(
|
|
398
521
|
value=_styled_value(table),
|
|
399
522
|
datatype=_HEADLINE_DATATYPES,
|
|
@@ -402,7 +525,7 @@ def build_app(
|
|
|
402
525
|
label=f"{dataset_id} — sorted by Macro F1",
|
|
403
526
|
)
|
|
404
527
|
with gr.Accordion("What do these columns mean?", open=False):
|
|
405
|
-
gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
|
|
528
|
+
legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
|
|
406
529
|
drilldown_label = gr.Markdown("_Click a row to load per-language metrics._")
|
|
407
530
|
# Seed the drilldown grid with an empty DataFrame so the Component
|
|
408
531
|
# has stable column headers before the first row click.
|
|
@@ -415,8 +538,13 @@ def build_app(
|
|
|
415
538
|
with gr.Accordion("What do these per-language columns mean?", open=False):
|
|
416
539
|
gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP))
|
|
417
540
|
|
|
541
|
+
scope_radio.change(
|
|
542
|
+
_make_scope_handler(sub),
|
|
543
|
+
inputs=[scope_radio],
|
|
544
|
+
outputs=[leaderboard, legend],
|
|
545
|
+
)
|
|
418
546
|
leaderboard.select(
|
|
419
|
-
_make_select_handler(dataset_id,
|
|
547
|
+
_make_select_handler(dataset_id, snapshot_root),
|
|
420
548
|
outputs=[drilldown_label, drilldown],
|
|
421
549
|
)
|
|
422
550
|
gr.Markdown(footer)
|