commonlid 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. {commonlid-0.2.2 → commonlid-0.2.4}/PKG-INFO +8 -2
  2. {commonlid-0.2.2 → commonlid-0.2.4}/README.md +5 -1
  3. commonlid-0.2.4/docs/contributing/adding_a_model.md +84 -0
  4. {commonlid-0.2.2 → commonlid-0.2.4}/pyproject.toml +9 -2
  5. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/evaluator.py +16 -0
  6. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/results.py +12 -2
  7. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/leaderboard/app.py +149 -21
  8. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/leaderboard/data.py +125 -2
  9. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/__init__.py +1 -0
  10. commonlid-0.2.4/src/commonlid/models/commonlingua.py +115 -0
  11. commonlid-0.2.4/src/commonlid/vendor/commonlingua/__init__.py +4 -0
  12. commonlid-0.2.4/src/commonlid/vendor/commonlingua/model.py +186 -0
  13. commonlid-0.2.4/tests/models/test_commonlingua.py +95 -0
  14. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_model_registration.py +1 -0
  15. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_leaderboard_data.py +227 -3
  16. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_results_io.py +30 -0
  17. {commonlid-0.2.2 → commonlid-0.2.4}/.gitignore +0 -0
  18. {commonlid-0.2.2 → commonlid-0.2.4}/LICENSE +0 -0
  19. {commonlid-0.2.2 → commonlid-0.2.4}/docs/architecture.md +0 -0
  20. {commonlid-0.2.2 → commonlid-0.2.4}/hf-space/README.md +0 -0
  21. {commonlid-0.2.2 → commonlid-0.2.4}/notebooks/README.md +0 -0
  22. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/__init__.py +0 -0
  23. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/_version.py +0 -0
  24. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/cli.py +0 -0
  25. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/__init__.py +0 -0
  26. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/lid_dataset.py +0 -0
  27. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/lid_model.py +0 -0
  28. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/core/registry.py +0 -0
  29. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/__init__.py +0 -0
  30. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/bibles.py +0 -0
  31. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/commonlid.py +0 -0
  32. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/flores_dev.py +0 -0
  33. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/nano.py +0 -0
  34. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/smolsent.py +0 -0
  35. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/social_media.py +0 -0
  36. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets/udhr.py +0 -0
  37. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets_tools/__init__.py +0 -0
  38. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets_tools/frequency_sample.py +0 -0
  39. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/datasets_tools/stratified_sample.py +0 -0
  40. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/__init__.py +0 -0
  41. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/evaluation/cache.py +0 -0
  42. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/leaderboard/__init__.py +0 -0
  43. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/logging.py +0 -0
  44. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/__init__.py +0 -0
  45. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/aggregate.py +0 -0
  46. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/core.py +0 -0
  47. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/fpr.py +0 -0
  48. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/metrics/support_matrix.py +0 -0
  49. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/_fasttext_base.py +0 -0
  50. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/afrolid.py +0 -0
  51. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/cld2.py +0 -0
  52. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/cld3.py +0 -0
  53. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/dspy_llm.py +0 -0
  54. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/fasttext_ft.py +0 -0
  55. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/funlangid.py +0 -0
  56. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/glotlid.py +0 -0
  57. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/openlidv2.py +0 -0
  58. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/models/pyfranc.py +0 -0
  59. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/preprocess/__init__.py +0 -0
  60. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/preprocess/langcodes.py +0 -0
  61. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/preprocess/openlid_normer.py +0 -0
  62. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/py.typed +0 -0
  63. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/vendor/__init__.py +0 -0
  64. {commonlid-0.2.2 → commonlid-0.2.4}/src/commonlid/vendor/fun_langid.py +0 -0
  65. {commonlid-0.2.2 → commonlid-0.2.4}/tests/__init__.py +0 -0
  66. {commonlid-0.2.2 → commonlid-0.2.4}/tests/conftest.py +0 -0
  67. {commonlid-0.2.2 → commonlid-0.2.4}/tests/fixtures/preprocess_golden.jsonl +0 -0
  68. {commonlid-0.2.2 → commonlid-0.2.4}/tests/fixtures/tiny_lid.jsonl +0 -0
  69. {commonlid-0.2.2 → commonlid-0.2.4}/tests/fixtures/tiny_support_matrix.csv +0 -0
  70. {commonlid-0.2.2 → commonlid-0.2.4}/tests/hf-space/test_space_entrypoint.py +0 -0
  71. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/__init__.py +0 -0
  72. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_bibles_build_vs_cache.py +0 -0
  73. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_cli_end_to_end.py +0 -0
  74. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_cli_generate_support_matrix.py +0 -0
  75. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_nano_build_vs_cache.py +0 -0
  76. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_readme_examples.py +0 -0
  77. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_smoke_parity.py +0 -0
  78. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_smoke_parity_commonlid.py +0 -0
  79. {commonlid-0.2.2 → commonlid-0.2.4}/tests/integration/test_smolsent_build_vs_cache.py +0 -0
  80. {commonlid-0.2.2 → commonlid-0.2.4}/tests/legacy/__init__.py +0 -0
  81. {commonlid-0.2.2 → commonlid-0.2.4}/tests/legacy/langid_datasets.py +0 -0
  82. {commonlid-0.2.2 → commonlid-0.2.4}/tests/legacy/langid_models.py +0 -0
  83. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/__init__.py +0 -0
  84. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_cld2.py +0 -0
  85. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_cld3.py +0 -0
  86. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_discover_supported_languages.py +0 -0
  87. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_dspy_llm.py +0 -0
  88. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_fasttext_base.py +0 -0
  89. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_funlangid.py +0 -0
  90. {commonlid-0.2.2 → commonlid-0.2.4}/tests/models/test_pyfranc.py +0 -0
  91. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/__init__.py +0 -0
  92. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_cache.py +0 -0
  93. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_cli_stub.py +0 -0
  94. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_dataset_build_from_source.py +0 -0
  95. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_dataset_registration.py +0 -0
  96. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_evaluator.py +0 -0
  97. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_frequency_sample.py +0 -0
  98. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_langcodes.py +0 -0
  99. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_lid_model_base.py +0 -0
  100. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_metrics_aggregate.py +0 -0
  101. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_metrics_core.py +0 -0
  102. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_metrics_fpr.py +0 -0
  103. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_nano_datasets.py +0 -0
  104. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_notebook_validity.py +0 -0
  105. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_preprocess.py +0 -0
  106. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_private_dataset_error.py +0 -0
  107. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_registry.py +0 -0
  108. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_stratified_sample.py +0 -0
  109. {commonlid-0.2.2 → commonlid-0.2.4}/tests/unit/test_support_matrix.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: commonlid
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Evaluate language identification models on CommonLID and other benchmarks.
5
5
  Project-URL: Homepage, https://huggingface.co/datasets/commoncrawl/CommonLID
6
6
  Project-URL: Paper, https://arxiv.org/abs/2601.18026
@@ -247,6 +247,8 @@ Requires-Dist: torch>=2.4; extra == 'all'
247
247
  Requires-Dist: transformers<5,>=4.46; extra == 'all'
248
248
  Provides-Extra: cld3
249
249
  Requires-Dist: cld3-py>=3.1; extra == 'cld3'
250
+ Provides-Extra: commonlingua
251
+ Requires-Dist: torch>=2.4; extra == 'commonlingua'
250
252
  Provides-Extra: dev
251
253
  Requires-Dist: azure-identity>=1.17; extra == 'dev'
252
254
  Requires-Dist: botocore>=1.35; extra == 'dev'
@@ -315,6 +317,7 @@ From PyPI:
315
317
  pip install commonlid # core deps + classical LID models
316
318
  pip install "commonlid[llm]" # + DSPy-based LLM evaluation
317
319
  pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
320
+ pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
318
321
  pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
319
322
  pip install "commonlid[all]" # everything runtime-facing
320
323
  ```
@@ -468,7 +471,7 @@ from commonlid import list_models, list_datasets
468
471
 
469
472
  assert list_models() == [
470
473
  "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
471
- "fasttext", "funlangid", "pyfranc",
474
+ "commonlingua", "fasttext", "funlangid", "pyfranc",
472
475
  ]
473
476
  assert list_datasets() == [
474
477
  "bibles_300", "bibles_300_nano",
@@ -574,6 +577,7 @@ for line in preds_path.read_text().splitlines():
574
577
  | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
575
578
  | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
576
579
  | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
580
+ | `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
577
581
  | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
578
582
 
579
583
  LLM models are instantiated dynamically (`DSPyLLMModel`) and not
@@ -704,6 +708,8 @@ exclude the `und` bucket by default (toggle with `include_und=True`).
704
708
 
705
709
  ## Adding a new model
706
710
 
711
+ A guide for adding a new model can be found [here](docs/contributing/adding_a_model.md).
712
+
707
713
  <!-- readme-test: fast; id=add-model (registers into an isolated registry) -->
708
714
  ```python
709
715
  # src/commonlid/models/my_model.py
@@ -39,6 +39,7 @@ From PyPI:
39
39
  pip install commonlid # core deps + classical LID models
40
40
  pip install "commonlid[llm]" # + DSPy-based LLM evaluation
41
41
  pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
42
+ pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
42
43
  pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
43
44
  pip install "commonlid[all]" # everything runtime-facing
44
45
  ```
@@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets
192
193
 
193
194
  assert list_models() == [
194
195
  "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
195
- "fasttext", "funlangid", "pyfranc",
196
+ "commonlingua", "fasttext", "funlangid", "pyfranc",
196
197
  ]
197
198
  assert list_datasets() == [
198
199
  "bibles_300", "bibles_300_nano",
@@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
298
299
  | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
299
300
  | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
300
301
  | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
302
+ | `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
301
303
  | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
302
304
 
303
305
  LLM models are instantiated dynamically (`DSPyLLMModel`) and not
@@ -428,6 +430,8 @@ exclude the `und` bucket by default (toggle with `include_und=True`).
428
430
 
429
431
  ## Adding a new model
430
432
 
433
+ A guide for adding a new model can be found [here](docs/contributing/adding_a_model.md).
434
+
431
435
  <!-- readme-test: fast; id=add-model (registers into an isolated registry) -->
432
436
  ```python
433
437
  # src/commonlid/models/my_model.py
@@ -0,0 +1,84 @@
1
+ # Adding a model to the leaderboard
2
+
3
+ The CommonLID leaderboard is available [here](https://huggingface.co/spaces/commoncrawl/commonlid).
4
+
5
+ 1. Add the [model implementation](#adding-a-model-implementation) to `commonlid`
6
+ 2. [Evaluate](#evaluate-new-model) the desired model using `commonlid` on the benchmarks
7
+ 3. Push the results to the [results repository](https://huggingface.co/datasets/commoncrawl/commonlid-results) via a PR. Once merged they will appear on the leaderboard.
8
+
9
+ ## Requesting an evaluation
10
+
11
+ If you want a model to be evaluated but are not submitting the results yourself, open an issue instead and provide the required information.
12
+
13
+ ## Adding a model implementation
14
+
15
+ Adding a model implementation to `commonlid` is quite straightforward. Typically, it only requires that you provide the text-to-language prediction method and add it to the [model directory](https://github.com/commoncrawl/commonlid-eval/tree/main/src/commonlid/models):
16
+
17
+ ```python
18
+ # src/commonlid/models/my_model.py
19
+ from collections.abc import Sequence
20
+
21
+ from commonlid.core.lid_model import LIDModel
22
+ from commonlid.core.registry import get_model, register_model
23
+
24
+
25
+ @register_model
26
+ class MyModel(LIDModel):
27
+ model_id = "my_model"
28
+
29
+ def _predict_batch(self, texts: Sequence[str]) -> list[str | None]:
30
+ # Return one ISO 639-3 code (or None for undetermined) per input.
31
+ # `texts` arrives post-OpenLID-normer cleaning by default;
32
+ # set `requires_preprocessing = False` to receive raw text.
33
+ return ["eng"] * len(texts)
34
+
35
+
36
+ assert get_model("my_model").predict(["hi"]) == ["eng"]
37
+ ```
38
+
39
+ Then import it from `src/commonlid/models/__init__.py` so the decorator
40
+ fires on `import commonlid`:
41
+
42
+ ```python
43
+ from commonlid.models import my_model as _my_model # noqa: F401
44
+ ```
45
+
46
+
47
+ ### Adding model dependencies
48
+
49
+ If you are adding a model that requires additional dependencies, you can add them to the `pyproject.toml` file, under optional dependencies:
50
+
51
+ ```toml
52
+ cld3 = ["cld3-py>=3.1"]
53
+ ```
54
+
55
+ Declaring a version constraint helps ensure that the implementation keeps working when the package is installed or updated.
56
+
57
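+ Because it is an optional dependency, you cannot import it at the top level of the module; instead, import it inside the model wrapper (for example, inside the method that needs it) so that `import commonlid` still works when the extra is not installed.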
58
+
59
+ ## Evaluate new model
60
+
61
+ As soon as the model implementation is registered, you can run this command to evaluate your model on CommonLID and its nano version:
62
+
63
+ ```bash
64
+ commonlid run \
65
+ --model my_model \
66
+ --dataset commonlid --dataset commonlid_nano \
67
+ --output-dir ./data/results
68
+ ```
69
+
70
+ You may need to reinstall the `commonlid` package with your changes if the package was not installed in editable mode.
71
+
72
+ ## Uploading the results data (PR-based)
73
+
74
+ After running the evaluation locally, you can upload the results to our [HF results repository](https://huggingface.co/datasets/commoncrawl/commonlid-results) as follows:
75
+
76
+ ```bash
77
+ hf auth login # token with write access to the results dataset
78
+ make leaderboard-upload # opens a Pull Request from ./data/results
79
+ # Override the target with: make leaderboard-upload LEADERBOARD_REPO=other/repo LEADERBOARD_DIR=./elsewhere
80
+ # Optional: pass --skip-predictions via `uv run commonlid leaderboard upload ...` directly.
81
+ ```
82
+
83
+ The CLI always opens a Pull Request rather than pushing to the default
84
+ branch, so the dataset owner reviews before merging.
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "commonlid"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Evaluate language identification models on CommonLID and other benchmarks."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -59,6 +59,11 @@ llm = [
59
59
  "botocore>=1.35",
60
60
  ]
61
61
  cld3 = ["cld3-py>=3.1"]
62
+ commonlingua = [
63
+ # CommonLingua is a 2.35M-param byte-level model; needs torch but not the
64
+ # transformers stack that [afrolid] pulls in.
65
+ "torch>=2.4",
66
+ ]
62
67
  leaderboard = [
63
68
  # gradio 4.x imports HfFolder from huggingface_hub, which was removed in
64
69
  # huggingface-hub 1.0; gradio 5 dropped that import.
@@ -88,7 +93,7 @@ notebooks = [
88
93
  "nbclient>=0.10",
89
94
  ]
90
95
  all = [
91
- "commonlid[afrolid,llm]",
96
+ "commonlid[afrolid,llm,commonlingua]",
92
97
  ]
93
98
 
94
99
  [project.scripts]
@@ -208,6 +213,8 @@ omit = [
208
213
  # afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
209
214
  # installed in dev and so exercised only via mocked unit tests.
210
215
  "src/commonlid/models/afrolid.py",
216
+ # commonlingua needs the `[commonlingua]` extra (torch); same precedent.
217
+ "src/commonlid/models/commonlingua.py",
211
218
  ]
212
219
 
213
220
  [tool.coverage.report]
@@ -159,6 +159,21 @@ class Evaluator:
159
159
  )
160
160
  n_with_gold = sum(1 for g in ytrue if g is not None)
161
161
  samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
162
+ # `None` here is meaningful: it tells downstream consumers that the
163
+ # model's support set is undefined (e.g. LLMs), distinct from a model
164
+ # that declared an empty set. Errors during discovery downgrade to
165
+ # the same "unknown" sentinel rather than crashing the run.
166
+ try:
167
+ supported = model.discover_supported_languages()
168
+ except Exception as exc:
169
+ logger.warning(
170
+ "%s discover_supported_languages() raised %s: %s -- recording as None",
171
+ prefix,
172
+ type(exc).__name__,
173
+ exc,
174
+ )
175
+ supported = None
176
+ supported_languages = sorted(supported) if supported is not None else None
162
177
  result = Result(
163
178
  model_id=model.model_id,
164
179
  dataset_id=dataset.dataset_id,
@@ -170,6 +185,7 @@ class Evaluator:
170
185
  limit=self.config.limit,
171
186
  timestamp=datetime.now(timezone.utc).isoformat(),
172
187
  commonlid_version=__version__,
188
+ supported_languages=supported_languages,
173
189
  )
174
190
 
175
191
  run_dir = self.config.output_dir / dataset.dataset_id / model.model_id
@@ -13,12 +13,20 @@ from typing import Any
13
13
  from commonlid.metrics.aggregate import macro_average, micro_average
14
14
  from commonlid.metrics.core import LanguageMetrics
15
15
 
16
- SCHEMA_VERSION = 2
16
+ SCHEMA_VERSION = 3
17
17
 
18
18
 
19
19
  @dataclass(slots=True)
20
20
  class Result:
21
- """Aggregate outcome of one model evaluated on one dataset."""
21
+ """Aggregate outcome of one model evaluated on one dataset.
22
+
23
+ ``supported_languages`` follows a tri-state convention shared with
24
+ :meth:`LIDModel.discover_supported_languages`: ``None`` means the
25
+ model's support set is undefined (e.g. LLM-based models that can be
26
+ prompted for any language), a list of ISO 639-3 codes is the closed
27
+ set the model declares, and an empty list is the degenerate "supports
28
+ zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
29
+ """
22
30
 
23
31
  model_id: str
24
32
  dataset_id: str
@@ -32,6 +40,7 @@ class Result:
32
40
  commonlid_version: str = ""
33
41
  python_version: str = field(default_factory=lambda: sys.version.split()[0])
34
42
  platform: str = field(default_factory=platform.platform)
43
+ supported_languages: list[str] | None = None
35
44
  extra: dict[str, Any] = field(default_factory=dict)
36
45
 
37
46
  def summary(self) -> dict[str, Any]:
@@ -52,6 +61,7 @@ class Result:
52
61
  "macro": macro_average(self.per_language),
53
62
  "micro": micro_average(self.per_language),
54
63
  "per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
64
+ "supported_languages": self.supported_languages,
55
65
  "extra": self.extra,
56
66
  }
57
67
 
@@ -42,6 +42,26 @@ BLOG_URL = (
42
42
  )
43
43
  PAPER_URL = "https://arxiv.org/abs/2601.18026"
44
44
 
45
+ WEBSITE_URL = "https://commonlid.org/"
46
+
47
+ NEW_MODEL_URL = (
48
+ "https://github.com/commoncrawl/commonlid-eval/blob/main/docs/contributing/adding_a_model.md"
49
+ )
50
+
51
+ Scope = Literal["all", "cov"]
52
+
53
+ #: Radio choices shown above each dataset's results table.
54
+ SCOPE_CHOICES: list[tuple[str, Scope]] = [
55
+ ("Scores are calculated over the whole dataset.", "all"),
56
+ (
57
+ "Scores are calculated on the subset of language varieties covered by the model. (cov.)",
58
+ "cov",
59
+ ),
60
+ ]
61
+
62
+ #: Sentinel string used when a row has no cov data (rendered as em-dash).
63
+ _NA_DISPLAY = "—"
64
+
45
65
  #: Display columns in the headline table (in order). Macro F1 is the headline metric.
46
66
  _HEADLINE_COLUMNS: list[tuple[str, str]] = [
47
67
  ("model_id", "Model"),
@@ -51,6 +71,19 @@ _HEADLINE_COLUMNS: list[tuple[str, str]] = [
51
71
  ("n_languages", "Languages"),
52
72
  ("samples_per_second", "Samples/s"),
53
73
  ]
74
+
75
+ #: Same columns, projected from the ``*_cov`` source fields. Display
76
+ #: labels stay identical so the table layout doesn't shift when the
77
+ #: scope radio is toggled.
78
+ _HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [
79
+ ("model_id", "Model"),
80
+ ("macro_f1_cov", "Macro F1"),
81
+ ("micro_f1_cov", "Micro F1"),
82
+ ("mean_fpr_cov", "Mean FPR (%)"),
83
+ ("n_languages_cov", "Languages"),
84
+ ("samples_per_second", "Samples/s"),
85
+ ]
86
+
54
87
  #: Right-aligned numeric columns get the ``number`` Gradio datatype which
55
88
  #: pushes values to the right edge of the cell.
56
89
  _GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"]
@@ -134,6 +167,46 @@ _DRILLDOWN_COLUMN_HELP: list[tuple[str, str]] = [
134
167
  ]
135
168
 
136
169
 
170
+ #: Per-column human descriptions for the **(cov.)** view — same metrics,
171
+ #: but restricted to the model's declared support set.
172
+ _HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [
173
+ ("Model", "Identifier of the language identification model."),
174
+ (
175
+ "Macro F1",
176
+ "Unweighted mean of per-language F1 (x100) **restricted to languages the "
177
+ "model declares it supports** (paper `(cov.)` definition). Languages outside "
178
+ "the model's support set are excluded from the average — a model that covers "
179
+ "a small but accurate subset of the benchmark is no longer penalised for the "
180
+ "long tail of languages it never claimed to handle. **Higher is better.** "
181
+ f"Models without a declared support set show `{_NA_DISPLAY}`.",
182
+ ),
183
+ (
184
+ "Micro F1",
185
+ "Sample-weighted F1 (x100) pooled over the **model-supported subset** of "
186
+ "gold samples only. **Higher is better.** "
187
+ f"`{_NA_DISPLAY}` when no support set is declared.",
188
+ ),
189
+ (
190
+ "Mean FPR (%)",
191
+ "Mean per-language false-positive rate computed only on samples whose gold "
192
+ "language is in the model's support set; TN counts confusion across other "
193
+ "supported languages, not the long tail. **Lower is better.** "
194
+ f"`{_NA_DISPLAY}` when no support set is declared.",
195
+ ),
196
+ (
197
+ "Languages",
198
+ "Number of model-supported languages that have at least one gold sample in "
199
+ "this dataset (`|supported ∩ gold|`). This is the size of the slice every "
200
+ "other `(cov.)` metric is averaged over.",
201
+ ),
202
+ (
203
+ "Samples/s",
204
+ "Throughput during evaluation (samples processed per second). Unaffected by "
205
+ "the scope toggle — it is a model-property, not a metric.",
206
+ ),
207
+ ]
208
+
209
+
137
210
  def _columns_help_markdown(items: list[tuple[str, str]]) -> str:
138
211
  """Render a (column, description) list as a Markdown bullet block."""
139
212
  return "\n".join(f"- **{label}** — {desc}" for label, desc in items)
@@ -157,30 +230,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]:
157
230
  return {"data": data, "headers": headers, "metadata": {"styling": styling}}
158
231
 
159
232
 
160
- def _format_table(df: Any) -> Any:
233
+ def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str:
234
+ """Format a numeric value with ``decimals`` precision, em-dash for ``None``/``NaN``."""
235
+ import pandas as pd
236
+
237
+ if value is None or (isinstance(value, float) and pd.isna(value)):
238
+ return _NA_DISPLAY
239
+ return f"{float(value) * scale:.{decimals}f}"
240
+
241
+
242
+ def _format_table(df: Any, scope: Scope = "all") -> Any:
161
243
  """Project + format a results DataFrame for one Gradio tab.
162
244
 
163
245
  Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00``
164
246
  not ``0``) so the rendered cells line up vertically; sort ordering is
165
- preserved by sorting on the raw ``macro_f1`` *before* formatting.
247
+ preserved by sorting on the raw float *before* formatting.
166
248
 
167
249
  - Macro F1 / Micro F1 / Samples/s use **1 decimal**.
168
250
  - Mean FPR (%) uses **2 decimals**.
251
+ - In ``scope="cov"``, rows without ``supported_languages`` data render
252
+ em-dashes for every cov metric and sort to the bottom.
169
253
  """
170
254
  import pandas as pd
171
255
 
256
+ columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS
257
+ display_labels = [label for _, label in columns]
172
258
  if df.empty:
173
- return pd.DataFrame(columns=[label for _, label in _HEADLINE_COLUMNS])
259
+ return pd.DataFrame(columns=display_labels)
260
+
174
261
  out = df.copy()
175
- # Sort on the raw float so the resulting order is correct; format only
176
- # afterwards (string sort would order "10" before "9").
177
- out = out.sort_values("macro_f1", ascending=False, kind="stable").reset_index(drop=True)
178
- out["macro_f1"] = (out["macro_f1"] * 100).map(lambda x: f"{x:.1f}")
179
- out["micro_f1"] = (out["micro_f1"] * 100).map(lambda x: f"{x:.1f}")
180
- out["mean_fpr"] = (out["mean_fpr"] * 100).map(lambda x: f"{x:.2f}")
181
- out["samples_per_second"] = out["samples_per_second"].map(lambda x: f"{x:.1f}")
182
- out = out[[k for k, _ in _HEADLINE_COLUMNS]]
183
- out.columns = [label for _, label in _HEADLINE_COLUMNS]
262
+ source = {key: key for key, _ in columns}
263
+ sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
264
+ # ``na_position="last"`` sinks rows without cov data to the bottom of
265
+ # the (cov.) view; the "all" view has no NaNs in this column.
266
+ out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last")
267
+ out = out.reset_index(drop=True)
268
+
269
+ macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
270
+ micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"]
271
+ fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"]
272
+ langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"]
273
+
274
+ out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100))
275
+ out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100))
276
+ out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100))
277
+ out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0))
278
+ out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1))
279
+
280
+ out = out[[k for k, _ in columns]]
281
+ out.columns = display_labels
184
282
  return out
185
283
 
186
284
 
@@ -314,23 +412,29 @@ def _format_license(license_name: str, license_url: str | None) -> str:
314
412
 
315
413
  def _make_select_handler(
316
414
  dataset_id: str,
317
- table: Any,
318
415
  snapshot_root: Path,
319
416
  ) -> Any:
320
417
  """Build the row-select callback as a closure over the captured state.
321
418
 
419
+ Uses ``gr.SelectData.row_value`` (Gradio's per-click payload that
420
+ contains the clicked row as a 1-D list) so the drilldown picks up the
421
+ *current* table ordering — switching the scope radio and then clicking
422
+ a row resolves to the row at its post-toggle position. Passing the
423
+ Dataframe component as an event input would not work: Gradio 6
424
+ preprocesses Dataframe inputs into ``pandas.DataFrame`` objects, not
425
+ the ``{"data", "headers"}`` dict we feed in via ``_styled_value``.
426
+
322
427
  Gradio inspects ``__defaults__`` when registering events, and comparing a
323
428
  DataFrame default against a type annotation hits an unimplemented arrow
324
429
  dtype path. A closure keeps the state out of the function signature.
325
430
  """
326
431
 
327
432
  def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
328
- if evt.index is None:
433
+ if evt.index is None or not evt.row_value:
329
434
  return ("_Click a row to load per-language metrics._", None)
330
- row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index
331
435
  try:
332
- model_id = table.iloc[row_idx]["Model"]
333
- except (IndexError, KeyError):
436
+ model_id = evt.row_value[0]
437
+ except (IndexError, TypeError):
334
438
  return ("_Could not resolve clicked row._", None)
335
439
  per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id)
336
440
  return (
@@ -341,6 +445,19 @@ def _make_select_handler(
341
445
  return _on_select
342
446
 
343
447
 
448
+ def _make_scope_handler(sub_df: Any) -> Any:
449
+ """Build the scope-radio change callback: swap the table data + legend in lockstep."""
450
+
451
+ def _on_change(scope: Scope) -> tuple[Any, str]:
452
+ help_items = _HEADLINE_COLUMN_HELP_COV if scope == "cov" else _HEADLINE_COLUMN_HELP
453
+ return (
454
+ _styled_value(_format_table(sub_df, scope=scope)),
455
+ _columns_help_markdown(help_items),
456
+ )
457
+
458
+ return _on_change
459
+
460
+
344
461
  def build_app(
345
462
  *,
346
463
  repo_id: str = DEFAULT_REPO_ID,
@@ -369,7 +486,7 @@ def build_app(
369
486
  f"Headline metric: **macro F1**. Models are ranked by macro F1 "
370
487
  f"within each tab; click a row to see per-language metrics.\n"
371
488
  f"\n"
372
- f"📝 [Blog post]({BLOG_URL}) • 📄 [Paper]({PAPER_URL})"
489
+ f"🌐 [Website]({WEBSITE_URL}) • 📝 [Blog post]({BLOG_URL}) • 📄 [Paper]({PAPER_URL}) • 🆕 [Add a model]({NEW_MODEL_URL})"
373
490
  )
374
491
  repo_url = f"https://huggingface.co/datasets/{repo_id}"
375
492
  if revision:
@@ -384,7 +501,7 @@ def build_app(
384
501
  with gr.Tab(label=tab_label):
385
502
  gr.Markdown(_dataset_metadata_markdown(dataset_id))
386
503
  sub = df[df["dataset_id"] == dataset_id]
387
- table = _format_table(sub)
504
+ table = _format_table(sub, scope="all")
388
505
  if table.empty:
389
506
  gr.Markdown(
390
507
  f"_No results for `{dataset_id}` in `{repo_id}` yet."
@@ -394,6 +511,12 @@ def build_app(
394
511
  )
395
512
  continue
396
513
 
514
+ scope_radio = gr.Radio(
515
+ choices=SCOPE_CHOICES,
516
+ value="all",
517
+ label="Scoring scope",
518
+ interactive=True,
519
+ )
397
520
  leaderboard = gr.Dataframe(
398
521
  value=_styled_value(table),
399
522
  datatype=_HEADLINE_DATATYPES,
@@ -402,7 +525,7 @@ def build_app(
402
525
  label=f"{dataset_id} — sorted by Macro F1",
403
526
  )
404
527
  with gr.Accordion("What do these columns mean?", open=False):
405
- gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
528
+ legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
406
529
  drilldown_label = gr.Markdown("_Click a row to load per-language metrics._")
407
530
  # Seed the drilldown grid with an empty DataFrame so the Component
408
531
  # has stable column headers before the first row click.
@@ -415,8 +538,13 @@ def build_app(
415
538
  with gr.Accordion("What do these per-language columns mean?", open=False):
416
539
  gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP))
417
540
 
541
+ scope_radio.change(
542
+ _make_scope_handler(sub),
543
+ inputs=[scope_radio],
544
+ outputs=[leaderboard, legend],
545
+ )
418
546
  leaderboard.select(
419
- _make_select_handler(dataset_id, table, snapshot_root),
547
+ _make_select_handler(dataset_id, snapshot_root),
420
548
  outputs=[drilldown_label, drilldown],
421
549
  )
422
550
  gr.Markdown(footer)