commonlid 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. {commonlid-0.2.2 → commonlid-0.2.3}/PKG-INFO +6 -2
  2. {commonlid-0.2.2 → commonlid-0.2.3}/README.md +3 -1
  3. {commonlid-0.2.2 → commonlid-0.2.3}/pyproject.toml +9 -2
  4. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/evaluator.py +16 -0
  5. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/results.py +12 -2
  6. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/app.py +143 -19
  7. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/data.py +125 -2
  8. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/__init__.py +1 -0
  9. commonlid-0.2.3/src/commonlid/models/commonlingua.py +115 -0
  10. commonlid-0.2.3/src/commonlid/vendor/commonlingua/__init__.py +4 -0
  11. commonlid-0.2.3/src/commonlid/vendor/commonlingua/model.py +186 -0
  12. commonlid-0.2.3/tests/models/test_commonlingua.py +95 -0
  13. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_model_registration.py +1 -0
  14. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_leaderboard_data.py +182 -3
  15. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_results_io.py +30 -0
  16. {commonlid-0.2.2 → commonlid-0.2.3}/.gitignore +0 -0
  17. {commonlid-0.2.2 → commonlid-0.2.3}/LICENSE +0 -0
  18. {commonlid-0.2.2 → commonlid-0.2.3}/docs/architecture.md +0 -0
  19. {commonlid-0.2.2 → commonlid-0.2.3}/hf-space/README.md +0 -0
  20. {commonlid-0.2.2 → commonlid-0.2.3}/notebooks/README.md +0 -0
  21. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/__init__.py +0 -0
  22. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/_version.py +0 -0
  23. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/cli.py +0 -0
  24. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/__init__.py +0 -0
  25. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/lid_dataset.py +0 -0
  26. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/lid_model.py +0 -0
  27. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/core/registry.py +0 -0
  28. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/__init__.py +0 -0
  29. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/bibles.py +0 -0
  30. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/commonlid.py +0 -0
  31. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/flores_dev.py +0 -0
  32. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/nano.py +0 -0
  33. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/smolsent.py +0 -0
  34. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/social_media.py +0 -0
  35. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets/udhr.py +0 -0
  36. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets_tools/__init__.py +0 -0
  37. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets_tools/frequency_sample.py +0 -0
  38. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/datasets_tools/stratified_sample.py +0 -0
  39. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/__init__.py +0 -0
  40. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/cache.py +0 -0
  41. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/__init__.py +0 -0
  42. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/logging.py +0 -0
  43. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/__init__.py +0 -0
  44. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/aggregate.py +0 -0
  45. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/core.py +0 -0
  46. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/fpr.py +0 -0
  47. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/metrics/support_matrix.py +0 -0
  48. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/_fasttext_base.py +0 -0
  49. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/afrolid.py +0 -0
  50. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/cld2.py +0 -0
  51. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/cld3.py +0 -0
  52. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/dspy_llm.py +0 -0
  53. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/fasttext_ft.py +0 -0
  54. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/funlangid.py +0 -0
  55. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/glotlid.py +0 -0
  56. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/openlidv2.py +0 -0
  57. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/pyfranc.py +0 -0
  58. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/preprocess/__init__.py +0 -0
  59. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/preprocess/langcodes.py +0 -0
  60. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/preprocess/openlid_normer.py +0 -0
  61. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/py.typed +0 -0
  62. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/vendor/__init__.py +0 -0
  63. {commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/vendor/fun_langid.py +0 -0
  64. {commonlid-0.2.2 → commonlid-0.2.3}/tests/__init__.py +0 -0
  65. {commonlid-0.2.2 → commonlid-0.2.3}/tests/conftest.py +0 -0
  66. {commonlid-0.2.2 → commonlid-0.2.3}/tests/fixtures/preprocess_golden.jsonl +0 -0
  67. {commonlid-0.2.2 → commonlid-0.2.3}/tests/fixtures/tiny_lid.jsonl +0 -0
  68. {commonlid-0.2.2 → commonlid-0.2.3}/tests/fixtures/tiny_support_matrix.csv +0 -0
  69. {commonlid-0.2.2 → commonlid-0.2.3}/tests/hf-space/test_space_entrypoint.py +0 -0
  70. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/__init__.py +0 -0
  71. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_bibles_build_vs_cache.py +0 -0
  72. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_cli_end_to_end.py +0 -0
  73. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_cli_generate_support_matrix.py +0 -0
  74. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_nano_build_vs_cache.py +0 -0
  75. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_readme_examples.py +0 -0
  76. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_smoke_parity.py +0 -0
  77. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_smoke_parity_commonlid.py +0 -0
  78. {commonlid-0.2.2 → commonlid-0.2.3}/tests/integration/test_smolsent_build_vs_cache.py +0 -0
  79. {commonlid-0.2.2 → commonlid-0.2.3}/tests/legacy/__init__.py +0 -0
  80. {commonlid-0.2.2 → commonlid-0.2.3}/tests/legacy/langid_datasets.py +0 -0
  81. {commonlid-0.2.2 → commonlid-0.2.3}/tests/legacy/langid_models.py +0 -0
  82. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/__init__.py +0 -0
  83. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_cld2.py +0 -0
  84. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_cld3.py +0 -0
  85. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_discover_supported_languages.py +0 -0
  86. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_dspy_llm.py +0 -0
  87. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_fasttext_base.py +0 -0
  88. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_funlangid.py +0 -0
  89. {commonlid-0.2.2 → commonlid-0.2.3}/tests/models/test_pyfranc.py +0 -0
  90. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/__init__.py +0 -0
  91. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_cache.py +0 -0
  92. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_cli_stub.py +0 -0
  93. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_dataset_build_from_source.py +0 -0
  94. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_dataset_registration.py +0 -0
  95. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_evaluator.py +0 -0
  96. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_frequency_sample.py +0 -0
  97. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_langcodes.py +0 -0
  98. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_lid_model_base.py +0 -0
  99. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_metrics_aggregate.py +0 -0
  100. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_metrics_core.py +0 -0
  101. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_metrics_fpr.py +0 -0
  102. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_nano_datasets.py +0 -0
  103. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_notebook_validity.py +0 -0
  104. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_preprocess.py +0 -0
  105. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_private_dataset_error.py +0 -0
  106. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_registry.py +0 -0
  107. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_stratified_sample.py +0 -0
  108. {commonlid-0.2.2 → commonlid-0.2.3}/tests/unit/test_support_matrix.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: commonlid
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Evaluate language identification models on CommonLID and other benchmarks.
5
5
  Project-URL: Homepage, https://huggingface.co/datasets/commoncrawl/CommonLID
6
6
  Project-URL: Paper, https://arxiv.org/abs/2601.18026
@@ -247,6 +247,8 @@ Requires-Dist: torch>=2.4; extra == 'all'
247
247
  Requires-Dist: transformers<5,>=4.46; extra == 'all'
248
248
  Provides-Extra: cld3
249
249
  Requires-Dist: cld3-py>=3.1; extra == 'cld3'
250
+ Provides-Extra: commonlingua
251
+ Requires-Dist: torch>=2.4; extra == 'commonlingua'
250
252
  Provides-Extra: dev
251
253
  Requires-Dist: azure-identity>=1.17; extra == 'dev'
252
254
  Requires-Dist: botocore>=1.35; extra == 'dev'
@@ -315,6 +317,7 @@ From PyPI:
315
317
  pip install commonlid # core deps + classical LID models
316
318
  pip install "commonlid[llm]" # + DSPy-based LLM evaluation
317
319
  pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
320
+ pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
318
321
  pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
319
322
  pip install "commonlid[all]" # everything runtime-facing
320
323
  ```
@@ -468,7 +471,7 @@ from commonlid import list_models, list_datasets
468
471
 
469
472
  assert list_models() == [
470
473
  "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
471
- "fasttext", "funlangid", "pyfranc",
474
+ "commonlingua", "fasttext", "funlangid", "pyfranc",
472
475
  ]
473
476
  assert list_datasets() == [
474
477
  "bibles_300", "bibles_300_nano",
@@ -574,6 +577,7 @@ for line in preds_path.read_text().splitlines():
574
577
  | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
575
578
  | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
576
579
  | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
580
+ | `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
577
581
  | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
578
582
 
579
583
  LLM models are instantiated dynamically (`DSPyLLMModel`) and not
@@ -39,6 +39,7 @@ From PyPI:
39
39
  pip install commonlid # core deps + classical LID models
40
40
  pip install "commonlid[llm]" # + DSPy-based LLM evaluation
41
41
  pip install "commonlid[afrolid]" # + torch/transformers for AfroLID
42
+ pip install "commonlid[commonlingua]" # + torch for the CommonLingua byte-level model
42
43
  pip install "commonlid[notebooks]" # + jupyterlab + matplotlib for paper_tables.ipynb
43
44
  pip install "commonlid[all]" # everything runtime-facing
44
45
  ```
@@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets
192
193
 
193
194
  assert list_models() == [
194
195
  "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
195
- "fasttext", "funlangid", "pyfranc",
196
+ "commonlingua", "fasttext", "funlangid", "pyfranc",
196
197
  ]
197
198
  assert list_datasets() == [
198
199
  "bibles_300", "bibles_300_nano",
@@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
298
299
  | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
299
300
  | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
300
301
  | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
302
+ | `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
301
303
  | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
302
304
 
303
305
  LLM models are instantiated dynamically (`DSPyLLMModel`) and not
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "commonlid"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Evaluate language identification models on CommonLID and other benchmarks."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -59,6 +59,11 @@ llm = [
59
59
  "botocore>=1.35",
60
60
  ]
61
61
  cld3 = ["cld3-py>=3.1"]
62
+ commonlingua = [
63
+ # CommonLingua is a 2.35M-param byte-level model; needs torch but not the
64
+ # transformers stack that [afrolid] pulls in.
65
+ "torch>=2.4",
66
+ ]
62
67
  leaderboard = [
63
68
  # gradio 4.x imports HfFolder from huggingface_hub, which was removed in
64
69
  # huggingface-hub 1.0; gradio 5 dropped that import.
@@ -88,7 +93,7 @@ notebooks = [
88
93
  "nbclient>=0.10",
89
94
  ]
90
95
  all = [
91
- "commonlid[afrolid,llm]",
96
+ "commonlid[afrolid,llm,commonlingua]",
92
97
  ]
93
98
 
94
99
  [project.scripts]
@@ -208,6 +213,8 @@ omit = [
208
213
  # afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
209
214
  # installed in dev and so exercised only via mocked unit tests.
210
215
  "src/commonlid/models/afrolid.py",
216
+ # commonlingua needs the `[commonlingua]` extra (torch); same precedent.
217
+ "src/commonlid/models/commonlingua.py",
211
218
  ]
212
219
 
213
220
  [tool.coverage.report]
@@ -159,6 +159,21 @@ class Evaluator:
159
159
  )
160
160
  n_with_gold = sum(1 for g in ytrue if g is not None)
161
161
  samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
162
+ # `None` here is meaningful: it tells downstream consumers that the
163
+ # model's support set is undefined (e.g. LLMs), distinct from a model
164
+ # that declared an empty set. Errors during discovery downgrade to
165
+ # the same "unknown" sentinel rather than crashing the run.
166
+ try:
167
+ supported = model.discover_supported_languages()
168
+ except Exception as exc:
169
+ logger.warning(
170
+ "%s discover_supported_languages() raised %s: %s -- recording as None",
171
+ prefix,
172
+ type(exc).__name__,
173
+ exc,
174
+ )
175
+ supported = None
176
+ supported_languages = sorted(supported) if supported is not None else None
162
177
  result = Result(
163
178
  model_id=model.model_id,
164
179
  dataset_id=dataset.dataset_id,
@@ -170,6 +185,7 @@ class Evaluator:
170
185
  limit=self.config.limit,
171
186
  timestamp=datetime.now(timezone.utc).isoformat(),
172
187
  commonlid_version=__version__,
188
+ supported_languages=supported_languages,
173
189
  )
174
190
 
175
191
  run_dir = self.config.output_dir / dataset.dataset_id / model.model_id
@@ -13,12 +13,20 @@ from typing import Any
13
13
  from commonlid.metrics.aggregate import macro_average, micro_average
14
14
  from commonlid.metrics.core import LanguageMetrics
15
15
 
16
- SCHEMA_VERSION = 2
16
+ SCHEMA_VERSION = 3
17
17
 
18
18
 
19
19
  @dataclass(slots=True)
20
20
  class Result:
21
- """Aggregate outcome of one model evaluated on one dataset."""
21
+ """Aggregate outcome of one model evaluated on one dataset.
22
+
23
+ ``supported_languages`` follows a tri-state convention shared with
24
+ :meth:`LIDModel.discover_supported_languages`: ``None`` means the
25
+ model's support set is undefined (e.g. LLM-based models that can be
26
+ prompted for any language), a list of ISO 639-3 codes is the closed
27
+ set the model declares, and an empty list is the degenerate "supports
28
+ zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
29
+ """
22
30
 
23
31
  model_id: str
24
32
  dataset_id: str
@@ -32,6 +40,7 @@ class Result:
32
40
  commonlid_version: str = ""
33
41
  python_version: str = field(default_factory=lambda: sys.version.split()[0])
34
42
  platform: str = field(default_factory=platform.platform)
43
+ supported_languages: list[str] | None = None
35
44
  extra: dict[str, Any] = field(default_factory=dict)
36
45
 
37
46
  def summary(self) -> dict[str, Any]:
@@ -52,6 +61,7 @@ class Result:
52
61
  "macro": macro_average(self.per_language),
53
62
  "micro": micro_average(self.per_language),
54
63
  "per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
64
+ "supported_languages": self.supported_languages,
55
65
  "extra": self.extra,
56
66
  }
57
67
 
@@ -42,6 +42,20 @@ BLOG_URL = (
42
42
  )
43
43
  PAPER_URL = "https://arxiv.org/abs/2601.18026"
44
44
 
45
+ Scope = Literal["all", "cov"]
46
+
47
+ #: Radio choices shown above each dataset's results table.
48
+ SCOPE_CHOICES: list[tuple[str, Scope]] = [
49
+ ("Scores are calculated over the whole dataset.", "all"),
50
+ (
51
+ "Scores are calculated on the subset of language varieties covered by the model. (cov.)",
52
+ "cov",
53
+ ),
54
+ ]
55
+
56
+ #: Sentinel string used when a row has no cov data (rendered as em-dash).
57
+ _NA_DISPLAY = "—"
58
+
45
59
  #: Display columns in the headline table (in order). Macro F1 is the headline metric.
46
60
  _HEADLINE_COLUMNS: list[tuple[str, str]] = [
47
61
  ("model_id", "Model"),
@@ -51,6 +65,19 @@ _HEADLINE_COLUMNS: list[tuple[str, str]] = [
51
65
  ("n_languages", "Languages"),
52
66
  ("samples_per_second", "Samples/s"),
53
67
  ]
68
+
69
+ #: Same columns, projected from the ``*_cov`` source fields. Display
70
+ #: labels stay identical so the table layout doesn't shift when the
71
+ #: scope radio is toggled.
72
+ _HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [
73
+ ("model_id", "Model"),
74
+ ("macro_f1_cov", "Macro F1"),
75
+ ("micro_f1_cov", "Micro F1"),
76
+ ("mean_fpr_cov", "Mean FPR (%)"),
77
+ ("n_languages_cov", "Languages"),
78
+ ("samples_per_second", "Samples/s"),
79
+ ]
80
+
54
81
  #: Right-aligned numeric columns get the ``number`` Gradio datatype which
55
82
  #: pushes values to the right edge of the cell.
56
83
  _GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"]
@@ -134,6 +161,46 @@ _DRILLDOWN_COLUMN_HELP: list[tuple[str, str]] = [
134
161
  ]
135
162
 
136
163
 
164
+ #: Per-column human descriptions for the **(cov.)** view — same metrics,
165
+ #: but restricted to the model's declared support set.
166
+ _HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [
167
+ ("Model", "Identifier of the language identification model."),
168
+ (
169
+ "Macro F1",
170
+ "Unweighted mean of per-language F1 (x100) **restricted to languages the "
171
+ "model declares it supports** (paper `(cov.)` definition). Languages outside "
172
+ "the model's support set are excluded from the average — a model that covers "
173
+ "a small but accurate subset of the benchmark is no longer penalised for the "
174
+ "long tail of languages it never claimed to handle. **Higher is better.** "
175
+ f"Models without a declared support set show `{_NA_DISPLAY}`.",
176
+ ),
177
+ (
178
+ "Micro F1",
179
+ "Sample-weighted F1 (x100) pooled over the **model-supported subset** of "
180
+ "gold samples only. **Higher is better.** "
181
+ f"`{_NA_DISPLAY}` when no support set is declared.",
182
+ ),
183
+ (
184
+ "Mean FPR (%)",
185
+ "Mean per-language false-positive rate computed only on samples whose gold "
186
+ "language is in the model's support set; TN counts confusion across other "
187
+ "supported languages, not the long tail. **Lower is better.** "
188
+ f"`{_NA_DISPLAY}` when no support set is declared.",
189
+ ),
190
+ (
191
+ "Languages",
192
+ "Number of model-supported languages that have at least one gold sample in "
193
+ "this dataset (`|supported ∩ gold|`). This is the size of the slice every "
194
+ "other `(cov.)` metric is averaged over.",
195
+ ),
196
+ (
197
+ "Samples/s",
198
+ "Throughput during evaluation (samples processed per second). Unaffected by "
199
+ "the scope toggle — it is a model-property, not a metric.",
200
+ ),
201
+ ]
202
+
203
+
137
204
  def _columns_help_markdown(items: list[tuple[str, str]]) -> str:
138
205
  """Render a (column, description) list as a Markdown bullet block."""
139
206
  return "\n".join(f"- **{label}** — {desc}" for label, desc in items)
@@ -157,30 +224,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]:
157
224
  return {"data": data, "headers": headers, "metadata": {"styling": styling}}
158
225
 
159
226
 
160
- def _format_table(df: Any) -> Any:
227
+ def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str:
228
+ """Format a numeric value with ``decimals`` precision, em-dash for ``None``/``NaN``."""
229
+ import pandas as pd
230
+
231
+ if value is None or (isinstance(value, float) and pd.isna(value)):
232
+ return _NA_DISPLAY
233
+ return f"{float(value) * scale:.{decimals}f}"
234
+
235
+
236
+ def _format_table(df: Any, scope: Scope = "all") -> Any:
161
237
  """Project + format a results DataFrame for one Gradio tab.
162
238
 
163
239
  Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00``
164
240
  not ``0``) so the rendered cells line up vertically; sort ordering is
165
- preserved by sorting on the raw ``macro_f1`` *before* formatting.
241
+ preserved by sorting on the raw float *before* formatting.
166
242
 
167
243
  - Macro F1 / Micro F1 / Samples/s use **1 decimal**.
168
244
  - Mean FPR (%) uses **2 decimals**.
245
+ - In ``scope="cov"``, rows without ``supported_languages`` data render
246
+ em-dashes for every cov metric and sort to the bottom.
169
247
  """
170
248
  import pandas as pd
171
249
 
250
+ columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS
251
+ display_labels = [label for _, label in columns]
172
252
  if df.empty:
173
- return pd.DataFrame(columns=[label for _, label in _HEADLINE_COLUMNS])
253
+ return pd.DataFrame(columns=display_labels)
254
+
174
255
  out = df.copy()
175
- # Sort on the raw float so the resulting order is correct; format only
176
- # afterwards (string sort would order "10" before "9").
177
- out = out.sort_values("macro_f1", ascending=False, kind="stable").reset_index(drop=True)
178
- out["macro_f1"] = (out["macro_f1"] * 100).map(lambda x: f"{x:.1f}")
179
- out["micro_f1"] = (out["micro_f1"] * 100).map(lambda x: f"{x:.1f}")
180
- out["mean_fpr"] = (out["mean_fpr"] * 100).map(lambda x: f"{x:.2f}")
181
- out["samples_per_second"] = out["samples_per_second"].map(lambda x: f"{x:.1f}")
182
- out = out[[k for k, _ in _HEADLINE_COLUMNS]]
183
- out.columns = [label for _, label in _HEADLINE_COLUMNS]
256
+ source = {key: key for key, _ in columns}
257
+ sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
258
+ # ``na_position="last"`` sinks rows without cov data to the bottom of
259
+ # the (cov.) view; the "all" view has no NaNs in this column.
260
+ out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last")
261
+ out = out.reset_index(drop=True)
262
+
263
+ macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
264
+ micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"]
265
+ fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"]
266
+ langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"]
267
+
268
+ out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100))
269
+ out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100))
270
+ out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100))
271
+ out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0))
272
+ out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1))
273
+
274
+ out = out[[k for k, _ in columns]]
275
+ out.columns = display_labels
184
276
  return out
185
277
 
186
278
 
@@ -314,23 +406,30 @@ def _format_license(license_name: str, license_url: str | None) -> str:
314
406
 
315
407
  def _make_select_handler(
316
408
  dataset_id: str,
317
- table: Any,
318
409
  snapshot_root: Path,
319
410
  ) -> Any:
320
411
  """Build the row-select callback as a closure over the captured state.
321
412
 
413
+ The callback looks up the clicked row in the *current* table value
414
+ (passed in via Gradio's event arg) so that switching the scope radio
415
+ and then clicking a row drills down the row at its post-toggle
416
+ position, not the row that would have been there before the swap.
417
+
322
418
  Gradio inspects ``__defaults__`` when registering events, and comparing a
323
419
  DataFrame default against a type annotation hits an unimplemented arrow
324
420
  dtype path. A closure keeps the state out of the function signature.
325
421
  """
326
422
 
327
- def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
423
+ def _on_select(table_value: Any, evt: gr.SelectData) -> tuple[str, Any]:
328
424
  if evt.index is None:
329
425
  return ("_Click a row to load per-language metrics._", None)
330
426
  row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index
331
427
  try:
332
- model_id = table.iloc[row_idx]["Model"]
333
- except (IndexError, KeyError):
428
+ data = table_value.get("data") if isinstance(table_value, dict) else None
429
+ if data is None:
430
+ return ("_Click a row to load per-language metrics._", None)
431
+ model_id = data[row_idx][0]
432
+ except (IndexError, KeyError, TypeError):
334
433
  return ("_Could not resolve clicked row._", None)
335
434
  per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id)
336
435
  return (
@@ -341,6 +440,19 @@ def _make_select_handler(
341
440
  return _on_select
342
441
 
343
442
 
443
+ def _make_scope_handler(sub_df: Any) -> Any:
444
+ """Build the scope-radio change callback: swap the table data + legend in lockstep."""
445
+
446
+ def _on_change(scope: Scope) -> tuple[Any, str]:
447
+ help_items = _HEADLINE_COLUMN_HELP_COV if scope == "cov" else _HEADLINE_COLUMN_HELP
448
+ return (
449
+ _styled_value(_format_table(sub_df, scope=scope)),
450
+ _columns_help_markdown(help_items),
451
+ )
452
+
453
+ return _on_change
454
+
455
+
344
456
  def build_app(
345
457
  *,
346
458
  repo_id: str = DEFAULT_REPO_ID,
@@ -384,7 +496,7 @@ def build_app(
384
496
  with gr.Tab(label=tab_label):
385
497
  gr.Markdown(_dataset_metadata_markdown(dataset_id))
386
498
  sub = df[df["dataset_id"] == dataset_id]
387
- table = _format_table(sub)
499
+ table = _format_table(sub, scope="all")
388
500
  if table.empty:
389
501
  gr.Markdown(
390
502
  f"_No results for `{dataset_id}` in `{repo_id}` yet."
@@ -394,6 +506,12 @@ def build_app(
394
506
  )
395
507
  continue
396
508
 
509
+ scope_radio = gr.Radio(
510
+ choices=SCOPE_CHOICES,
511
+ value="all",
512
+ label="Scoring scope",
513
+ interactive=True,
514
+ )
397
515
  leaderboard = gr.Dataframe(
398
516
  value=_styled_value(table),
399
517
  datatype=_HEADLINE_DATATYPES,
@@ -402,7 +520,7 @@ def build_app(
402
520
  label=f"{dataset_id} — sorted by Macro F1",
403
521
  )
404
522
  with gr.Accordion("What do these columns mean?", open=False):
405
- gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
523
+ legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
406
524
  drilldown_label = gr.Markdown("_Click a row to load per-language metrics._")
407
525
  # Seed the drilldown grid with an empty DataFrame so the Component
408
526
  # has stable column headers before the first row click.
@@ -415,8 +533,14 @@ def build_app(
415
533
  with gr.Accordion("What do these per-language columns mean?", open=False):
416
534
  gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP))
417
535
 
536
+ scope_radio.change(
537
+ _make_scope_handler(sub),
538
+ inputs=[scope_radio],
539
+ outputs=[leaderboard, legend],
540
+ )
418
541
  leaderboard.select(
419
- _make_select_handler(dataset_id, table, snapshot_root),
542
+ _make_select_handler(dataset_id, snapshot_root),
543
+ inputs=[leaderboard],
420
544
  outputs=[drilldown_label, drilldown],
421
545
  )
422
546
  gr.Markdown(footer)
@@ -14,11 +14,15 @@ from __future__ import annotations
14
14
 
15
15
  import json
16
16
  import logging
17
- from collections.abc import Iterable
17
+ import math
18
+ from collections.abc import Iterable, Mapping
18
19
  from dataclasses import asdict, dataclass
19
20
  from pathlib import Path
20
21
  from typing import Any
21
22
 
23
+ from commonlid.metrics.core import LanguageMetrics
24
+ from commonlid.metrics.fpr import mean_false_positive_rate, mean_stats_with_coverage
25
+
22
26
  logger = logging.getLogger(__name__)
23
27
 
24
28
  DEFAULT_REPO_ID = "commoncrawl/commonlid-results"
@@ -40,6 +44,14 @@ class LeaderboardRow:
40
44
  gold set. That's a model-property number, not a paper headline, and
41
45
  it stays consistent across rows: every model is reported on the same
42
46
  "what languages did you actually output here" basis.
47
+
48
+ The ``*_cov`` mirror fields are the same metrics restricted to gold
49
+ samples whose language is in the model's declared support set
50
+ (``supported_languages``). They are ``None`` when no support set is
51
+ available — either the field is missing from ``summary.json`` (legacy
52
+ file), the field is JSON ``null`` (LLM-style models whose support set
53
+ is undefined), or the field is an empty list (degenerate "supports
54
+ zero languages"). All three render as em-dashes in the cov view.
43
55
  """
44
56
 
45
57
  dataset_id: str
@@ -57,6 +69,13 @@ class LeaderboardRow:
57
69
  commonlid_version: str
58
70
  timestamp: str
59
71
  is_imported: bool
72
+ supported_languages: list[str] | None
73
+ macro_f1_cov: float | None
74
+ macro_precision_cov: float | None
75
+ macro_recall_cov: float | None
76
+ micro_f1_cov: float | None
77
+ mean_fpr_cov: float | None
78
+ n_languages_cov: int | None
60
79
 
61
80
  def to_dict(self) -> dict[str, Any]:
62
81
  return asdict(self)
@@ -68,10 +87,107 @@ def _safe_mean_fpr(per_language: dict[str, dict[str, Any]]) -> float:
68
87
  return sum(vals) / len(vals) if vals else 0.0
69
88
 
70
89
 
90
def _hydrate_per_language(
    per_language: Mapping[str, Mapping[str, Any]],
) -> dict[str, LanguageMetrics]:
    """Reconstruct :class:`LanguageMetrics` objects from the serialised dict form.

    Args:
        per_language: Mapping of language code to that language's serialised
            metric fields (``gt_count``, ``predictions``, ``correct``,
            ``precision``, ``recall``, ``f1``, ``fpr``).

    Returns:
        A new dict holding one :class:`LanguageMetrics` per input language.

    A missing key and an explicit JSON ``null`` are treated alike: counts
    fall back to ``0``, precision/recall/F1 fall back to ``0.0``, and
    ``fpr`` stays ``None``.
    """
    return {
        lang: LanguageMetrics(
            # ``or 0`` absorbs an explicit ``null``; ``int(None)`` would
            # raise TypeError, while the float fields below already
            # tolerate ``null`` the same way.
            gt_count=int(m.get("gt_count") or 0),
            predictions=int(m.get("predictions") or 0),
            correct=int(m.get("correct") or 0),
            precision=float(m.get("precision", 0.0) or 0.0),
            recall=float(m.get("recall", 0.0) or 0.0),
            f1=float(m.get("f1", 0.0) or 0.0),
            # ``fpr`` is genuinely optional, so ``None`` is preserved
            # rather than coerced to a number.
            fpr=None if m.get("fpr") is None else float(m["fpr"]),
        )
        for lang, m in per_language.items()
    }
106
+
107
+
108
def _micro_average_over(rows: Mapping[str, LanguageMetrics]) -> tuple[float, float, float]:
    """Pooled precision/recall/F1 over a (filtered) per-language slice.

    Mirrors :func:`commonlid.metrics.aggregate.micro_average`'s
    ``*_gold_only`` math but accepts a pre-filtered subset, which the
    public helper does not.

    Args:
        rows: Per-language metrics to pool; only ``correct``,
            ``predictions`` and ``gt_count`` are read from each value.

    Returns:
        ``(precision, recall, f1)``. Each component is ``0.0`` when its
        denominator is zero, so an empty slice yields ``(0.0, 0.0, 0.0)``.
    """
    total_correct = sum(m.correct for m in rows.values())
    total_predictions = sum(m.predictions for m in rows.values())
    total_gold = sum(m.gt_count for m in rows.values())

    precision = total_correct / total_predictions if total_predictions > 0 else 0.0
    recall = total_correct / total_gold if total_gold > 0 else 0.0

    # The harmonic mean is only defined for a strictly positive sum; this
    # single comparison is the complete zero-division guard.
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0
    return precision, recall, f1
126
+
127
+
128
def _compute_cov_fields(
    per_language_raw: Mapping[str, Mapping[str, Any]],
    supported: list[str] | None,
) -> dict[str, float | int | None]:
    """Build the six ``*_cov`` leaderboard fields for one model run.

    How ``supported`` is interpreted:

    - ``None``: the support set is undefined (e.g. an LLM), so there is no
      cov data.
    - ``[]``: the model declared zero supported languages; every cov metric
      would divide by zero, so this is reported as no-data too.
    - non-empty list: the cov metrics are computed.

    Returns:
        A dict with the keys ``macro_f1_cov``, ``macro_precision_cov``,
        ``macro_recall_cov``, ``micro_f1_cov``, ``mean_fpr_cov`` and
        ``n_languages_cov``. Every value is ``None`` when there is no cov
        data, which includes the case where the coverage statistics report
        a ``cov_count`` of zero.
    """
    no_data: dict[str, float | int | None] = dict.fromkeys(
        (
            "macro_f1_cov",
            "macro_precision_cov",
            "macro_recall_cov",
            "micro_f1_cov",
            "mean_fpr_cov",
            "n_languages_cov",
        ),
        None,
    )
    if not supported:
        return no_data

    whitelist = set(supported)
    metrics = _hydrate_per_language(per_language_raw)
    coverage = mean_stats_with_coverage(metrics, model_supported_languages=whitelist)["cov"]

    covered_count = int(coverage.get("cov_count", 0))
    if covered_count == 0:
        # The supported set has no overlap with the dataset's gold
        # languages, so there is nothing meaningful to report.
        return no_data

    # Micro F1 is pooled only over supported languages that occur in gold.
    in_scope = {
        lang: entry
        for lang, entry in metrics.items()
        if entry.gt_count > 0 and lang in whitelist
    }
    pooled_f1 = _micro_average_over(in_scope)[2]
    fpr = mean_false_positive_rate(metrics, language_whitelist=whitelist)

    return {
        "macro_f1_cov": float(coverage["f1"]),
        "macro_precision_cov": float(coverage["precision"]),
        "macro_recall_cov": float(coverage["recall"]),
        "micro_f1_cov": float(pooled_f1),
        "mean_fpr_cov": float(fpr),
        "n_languages_cov": covered_count,
    }
176
+
177
+
71
178
  def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -> LeaderboardRow:
72
179
  macro = summary.get("macro", {})
73
180
  micro = summary.get("micro", {})
74
181
  extra = summary.get("extra", {}) or {}
182
+ per_language = summary.get("per_language", {}) or {}
183
+
184
+ # Tri-state: missing key, JSON null, or list. Anything else (e.g. an
185
+ # accidentally-serialised set) collapses to "unknown".
186
+ raw_supported = summary.get("supported_languages")
187
+ supported: list[str] | None = list(raw_supported) if isinstance(raw_supported, list) else None
188
+ cov = _compute_cov_fields(per_language, supported)
189
+
190
+ n_languages_cov = cov["n_languages_cov"]
75
191
  return LeaderboardRow(
76
192
  dataset_id=dataset_id,
77
193
  model_id=model_id,
@@ -79,7 +195,7 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
79
195
  macro_precision=float(macro.get("precision_gold_only", 0.0)),
80
196
  macro_recall=float(macro.get("recall_gold_only", 0.0)),
81
197
  micro_f1=float(micro.get("f1_gold_only", 0.0)),
82
- mean_fpr=_safe_mean_fpr(summary.get("per_language", {})),
198
+ mean_fpr=_safe_mean_fpr(per_language),
83
199
  n_languages=int(macro.get("n_languages_observed", 0)),
84
200
  n_samples=int(summary.get("n_samples", 0)),
85
201
  n_samples_with_gold=int(summary.get("n_samples_with_gold", 0)),
@@ -88,6 +204,13 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
88
204
  commonlid_version=str(summary.get("commonlid_version", "")),
89
205
  timestamp=str(summary.get("timestamp", "")),
90
206
  is_imported=("imported_from" in extra),
207
+ supported_languages=supported,
208
+ macro_f1_cov=cov["macro_f1_cov"],
209
+ macro_precision_cov=cov["macro_precision_cov"],
210
+ macro_recall_cov=cov["macro_recall_cov"],
211
+ micro_f1_cov=cov["micro_f1_cov"],
212
+ mean_fpr_cov=cov["mean_fpr_cov"],
213
+ n_languages_cov=int(n_languages_cov) if n_languages_cov is not None else None,
91
214
  )
92
215
 
93
216
 
@@ -11,6 +11,7 @@ directly if you want to evaluate an LLM.
11
11
  from commonlid.models import afrolid as _afrolid # noqa: F401
12
12
  from commonlid.models import cld2 as _cld2 # noqa: F401
13
13
  from commonlid.models import cld3 as _cld3 # noqa: F401
14
+ from commonlid.models import commonlingua as _commonlingua # noqa: F401
14
15
  from commonlid.models import fasttext_ft as _fasttext_ft # noqa: F401
15
16
  from commonlid.models import funlangid as _funlangid # noqa: F401
16
17
  from commonlid.models import glotlid as _glotlid # noqa: F401