PyPI - commonlid - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

commonlid 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

{commonlid-0.2.2 → commonlid-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: commonlid
-Version: 0.2.2
+Version: 0.2.3
 Summary: Evaluate language identification models on CommonLID and other benchmarks.
 Project-URL: Homepage, https://huggingface.co/datasets/commoncrawl/CommonLID
 Project-URL: Paper, https://arxiv.org/abs/2601.18026
@@ -247,6 +247,8 @@ Requires-Dist: torch>=2.4; extra == 'all'
 Requires-Dist: transformers<5,>=4.46; extra == 'all'
 Provides-Extra: cld3
 Requires-Dist: cld3-py>=3.1; extra == 'cld3'
+Provides-Extra: commonlingua
+Requires-Dist: torch>=2.4; extra == 'commonlingua'
 Provides-Extra: dev
 Requires-Dist: azure-identity>=1.17; extra == 'dev'
 Requires-Dist: botocore>=1.35; extra == 'dev'
@@ -315,6 +317,7 @@ From PyPI:
 pip install commonlid                      # core deps + classical LID models
 pip install "commonlid[llm]"               # + DSPy-based LLM evaluation
 pip install "commonlid[afrolid]"           # + torch/transformers for AfroLID
+pip install "commonlid[commonlingua]"      # + torch for the CommonLingua byte-level model
 pip install "commonlid[notebooks]"         # + jupyterlab + matplotlib for paper_tables.ipynb
 pip install "commonlid[all]"               # everything runtime-facing
 ```
@@ -468,7 +471,7 @@ from commonlid import list_models, list_datasets
 assert list_models() == [
     "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
-    "fasttext", "funlangid", "pyfranc",
+    "commonlingua", "fasttext", "funlangid", "pyfranc",
 ]
 assert list_datasets() == [
     "bibles_300", "bibles_300_nano",
@@ -574,6 +577,7 @@ for line in preds_path.read_text().splitlines():
 | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
 | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
 | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
+| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
 | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
 LLM models are instantiated dynamically (`DSPyLLMModel`) and not

{commonlid-0.2.2 → commonlid-0.2.3}/README.md RENAMED Viewed

@@ -39,6 +39,7 @@ From PyPI:
 pip install commonlid                      # core deps + classical LID models
 pip install "commonlid[llm]"               # + DSPy-based LLM evaluation
 pip install "commonlid[afrolid]"           # + torch/transformers for AfroLID
+pip install "commonlid[commonlingua]"      # + torch for the CommonLingua byte-level model
 pip install "commonlid[notebooks]"         # + jupyterlab + matplotlib for paper_tables.ipynb
 pip install "commonlid[all]"               # everything runtime-facing
 ```
@@ -192,7 +193,7 @@ from commonlid import list_models, list_datasets
 assert list_models() == [
     "AfroLID", "GlotLID", "OpenLID-v2", "cld2", "cld3",
-    "fasttext", "funlangid", "pyfranc",
+    "commonlingua", "fasttext", "funlangid", "pyfranc",
 ]
 assert list_datasets() == [
     "bibles_300", "bibles_300_nano",
@@ -298,6 +299,7 @@ for line in preds_path.read_text().splitlines():
 | `fasttext` | [facebook/fasttext-language-identification](https://huggingface.co/facebook/fasttext-language-identification) | fasttext |
 | `pyfranc` | [pyfranc](https://pypi.org/project/pyfranc/) | Pure Python |
 | `AfroLID` | [UBC-NLP/afrolid_1.5](https://huggingface.co/UBC-NLP/afrolid_1.5) | Requires `[afrolid]` extra |
+| `commonlingua` | [PleIAs/CommonLingua](https://huggingface.co/PleIAs/CommonLingua) | 2.35M-param byte-level model, 334 languages; requires `[commonlingua]` extra |
 | `funlangid` | Vendored in `src/commonlid/vendor/fun_langid.py` | Simple char-4gram baseline |
 LLM models are instantiated dynamically (`DSPyLLMModel`) and not

{commonlid-0.2.2 → commonlid-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "commonlid"
-version = "0.2.2"
+version = "0.2.3"
 description = "Evaluate language identification models on CommonLID and other benchmarks."
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -59,6 +59,11 @@ llm = [
     "botocore>=1.35",
 ]
 cld3 = ["cld3-py>=3.1"]
+commonlingua = [
+    # CommonLingua is a 2.35M-param byte-level model; needs torch but not the
+    # transformers stack that [afrolid] pulls in.
+    "torch>=2.4",
+]
 leaderboard = [
     # gradio 4.x imports HfFolder from huggingface_hub, which was removed in
     # huggingface-hub 1.0; gradio 5 dropped that import.
@@ -88,7 +93,7 @@ notebooks = [
     "nbclient>=0.10",
 ]
 all = [
-    "commonlid[afrolid,llm]",
+    "commonlid[afrolid,llm,commonlingua]",
 ]
 [project.scripts]
@@ -208,6 +213,8 @@ omit = [
     # afrolid needs the heavy `[afrolid]` extra (torch + transformers); not
     # installed in dev and so exercised only via mocked unit tests.
     "src/commonlid/models/afrolid.py",
+    # commonlingua needs the `[commonlingua]` extra (torch); same precedent.
+    "src/commonlid/models/commonlingua.py",
 ]
 [tool.coverage.report]

{commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/evaluator.py RENAMED Viewed

@@ -159,6 +159,21 @@ class Evaluator:
         )
         n_with_gold = sum(1 for g in ytrue if g is not None)
         samples_per_second = (len(ytrue) / elapsed) if elapsed > 0 else 0.0
+        # `None` here is meaningful: it tells downstream consumers that the
+        # model's support set is undefined (e.g. LLMs), distinct from a model
+        # that declared an empty set. Errors during discovery downgrade to
+        # the same "unknown" sentinel rather than crashing the run.
+        try:
+            supported = model.discover_supported_languages()
+        except Exception as exc:
+            logger.warning(
+                "%s   discover_supported_languages() raised %s: %s -- recording as None",
+                prefix,
+                type(exc).__name__,
+                exc,
+            )
+            supported = None
+        supported_languages = sorted(supported) if supported is not None else None
         result = Result(
             model_id=model.model_id,
             dataset_id=dataset.dataset_id,
@@ -170,6 +185,7 @@ class Evaluator:
             limit=self.config.limit,
             timestamp=datetime.now(timezone.utc).isoformat(),
             commonlid_version=__version__,
+            supported_languages=supported_languages,
         )
         run_dir = self.config.output_dir / dataset.dataset_id / model.model_id

{commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/evaluation/results.py RENAMED Viewed

@@ -13,12 +13,20 @@ from typing import Any
 from commonlid.metrics.aggregate import macro_average, micro_average
 from commonlid.metrics.core import LanguageMetrics
-SCHEMA_VERSION = 2
+SCHEMA_VERSION = 3
 @dataclass(slots=True)
 class Result:
-    """Aggregate outcome of one model evaluated on one dataset."""
+    """Aggregate outcome of one model evaluated on one dataset.
+    ``supported_languages`` follows a tri-state convention shared with
+    :meth:`LIDModel.discover_supported_languages`: ``None`` means the
+    model's support set is undefined (e.g. LLM-based models that can be
+    prompted for any language), a list of ISO 639-3 codes is the closed
+    set the model declares, and an empty list is the degenerate "supports
+    zero languages" case. The leaderboard's ``(cov.)`` view consumes this.
+    """
     model_id: str
     dataset_id: str
@@ -32,6 +40,7 @@ class Result:
     commonlid_version: str = ""
     python_version: str = field(default_factory=lambda: sys.version.split()[0])
     platform: str = field(default_factory=platform.platform)
+    supported_languages: list[str] | None = None
     extra: dict[str, Any] = field(default_factory=dict)
     def summary(self) -> dict[str, Any]:
@@ -52,6 +61,7 @@ class Result:
             "macro": macro_average(self.per_language),
             "micro": micro_average(self.per_language),
             "per_language": {lang: asdict(m) for lang, m in sorted(self.per_language.items())},
+            "supported_languages": self.supported_languages,
             "extra": self.extra,
         }

{commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/app.py RENAMED Viewed

@@ -42,6 +42,20 @@ BLOG_URL = (
 )
 PAPER_URL = "https://arxiv.org/abs/2601.18026"
+Scope = Literal["all", "cov"]
+#: Radio choices shown above each dataset's results table.
+SCOPE_CHOICES: list[tuple[str, Scope]] = [
+    ("Scores are calculated over the whole dataset.", "all"),
+    (
+        "Scores are calculated on the subset of language varieties covered by the model. (cov.)",
+        "cov",
+    ),
+]
+#: Sentinel string used when a row has no cov data (rendered as em-dash).
+_NA_DISPLAY = "—"
 #: Display columns in the headline table (in order). Macro F1 is the headline metric.
 _HEADLINE_COLUMNS: list[tuple[str, str]] = [
     ("model_id", "Model"),
@@ -51,6 +65,19 @@ _HEADLINE_COLUMNS: list[tuple[str, str]] = [
     ("n_languages", "Languages"),
     ("samples_per_second", "Samples/s"),
 ]
+#: Same columns, projected from the ``*_cov`` source fields. Display
+#: labels stay identical so the table layout doesn't shift when the
+#: scope radio is toggled.
+_HEADLINE_COLUMNS_COV: list[tuple[str, str]] = [
+    ("model_id", "Model"),
+    ("macro_f1_cov", "Macro F1"),
+    ("micro_f1_cov", "Micro F1"),
+    ("mean_fpr_cov", "Mean FPR (%)"),
+    ("n_languages_cov", "Languages"),
+    ("samples_per_second", "Samples/s"),
+]
 #: Right-aligned numeric columns get the ``number`` Gradio datatype which
 #: pushes values to the right edge of the cell.
 _GradioDtype = Literal["str", "number", "bool", "date", "markdown", "html"]
@@ -134,6 +161,46 @@ _DRILLDOWN_COLUMN_HELP: list[tuple[str, str]] = [
 ]
+#: Per-column human descriptions for the **(cov.)** view — same metrics,
+#: but restricted to the model's declared support set.
+_HEADLINE_COLUMN_HELP_COV: list[tuple[str, str]] = [
+    ("Model", "Identifier of the language identification model."),
+    (
+        "Macro F1",
+        "Unweighted mean of per-language F1 (x100) **restricted to languages the "
+        "model declares it supports** (paper `(cov.)` definition). Languages outside "
+        "the model's support set are excluded from the average — a model that covers "
+        "a small but accurate subset of the benchmark is no longer penalised for the "
+        "long tail of languages it never claimed to handle. **Higher is better.** "
+        f"Models without a declared support set show `{_NA_DISPLAY}`.",
+    ),
+    (
+        "Micro F1",
+        "Sample-weighted F1 (x100) pooled over the **model-supported subset** of "
+        "gold samples only. **Higher is better.** "
+        f"`{_NA_DISPLAY}` when no support set is declared.",
+    ),
+    (
+        "Mean FPR (%)",
+        "Mean per-language false-positive rate computed only on samples whose gold "
+        "language is in the model's support set; TN counts confusion across other "
+        "supported languages, not the long tail. **Lower is better.** "
+        f"`{_NA_DISPLAY}` when no support set is declared.",
+    ),
+    (
+        "Languages",
+        "Number of model-supported languages that have at least one gold sample in "
+        "this dataset (`|supported ∩ gold|`). This is the size of the slice every "
+        "other `(cov.)` metric is averaged over.",
+    ),
+    (
+        "Samples/s",
+        "Throughput during evaluation (samples processed per second). Unaffected by "
+        "the scope toggle — it is a model-property, not a metric.",
+    ),
+]
 def _columns_help_markdown(items: list[tuple[str, str]]) -> str:
     """Render a (column, description) list as a Markdown bullet block."""
     return "\n".join(f"- **{label}** — {desc}" for label, desc in items)
@@ -157,30 +224,55 @@ def _styled_value(table: Any, right_align_after_col: int = 0) -> dict[str, Any]:
     return {"data": data, "headers": headers, "metadata": {"styling": styling}}
-def _format_table(df: Any) -> Any:
+def _fmt(value: Any, decimals: int, *, scale: float = 1.0) -> str:
+    """Format a numeric value with ``decimals`` precision, em-dash for ``None``/``NaN``."""
+    import pandas as pd
+    if value is None or (isinstance(value, float) and pd.isna(value)):
+        return _NA_DISPLAY
+    return f"{float(value) * scale:.{decimals}f}"
+def _format_table(df: Any, scope: Scope = "all") -> Any:
     """Project + format a results DataFrame for one Gradio tab.
     Numeric columns are converted to **fixed-decimal strings** (e.g. ``0.00``
     not ``0``) so the rendered cells line up vertically; sort ordering is
-    preserved by sorting on the raw ``macro_f1`` *before* formatting.
+    preserved by sorting on the raw float *before* formatting.
     - Macro F1 / Micro F1 / Samples/s use **1 decimal**.
     - Mean FPR (%) uses **2 decimals**.
+    - In ``scope="cov"``, rows without ``supported_languages`` data render
+      em-dashes for every cov metric and sort to the bottom.
     """
     import pandas as pd
+    columns = _HEADLINE_COLUMNS_COV if scope == "cov" else _HEADLINE_COLUMNS
+    display_labels = [label for _, label in columns]
     if df.empty:
-        return pd.DataFrame(columns=[label for _, label in _HEADLINE_COLUMNS])
+        return pd.DataFrame(columns=display_labels)
     out = df.copy()
-    # Sort on the raw float so the resulting order is correct; format only
-    # afterwards (string sort would order "10" before "9").
-    out = out.sort_values("macro_f1", ascending=False, kind="stable").reset_index(drop=True)
-    out["macro_f1"] = (out["macro_f1"] * 100).map(lambda x: f"{x:.1f}")
-    out["micro_f1"] = (out["micro_f1"] * 100).map(lambda x: f"{x:.1f}")
-    out["mean_fpr"] = (out["mean_fpr"] * 100).map(lambda x: f"{x:.2f}")
-    out["samples_per_second"] = out["samples_per_second"].map(lambda x: f"{x:.1f}")
-    out = out[[k for k, _ in _HEADLINE_COLUMNS]]
-    out.columns = [label for _, label in _HEADLINE_COLUMNS]
+    source = {key: key for key, _ in columns}
+    sort_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
+    # ``na_position="last"`` sinks rows without cov data to the bottom of
+    # the (cov.) view; the "all" view has no NaNs in this column.
+    out = out.sort_values(sort_key, ascending=False, kind="stable", na_position="last")
+    out = out.reset_index(drop=True)
+    macro_key = source["macro_f1_cov"] if scope == "cov" else source["macro_f1"]
+    micro_key = source["micro_f1_cov"] if scope == "cov" else source["micro_f1"]
+    fpr_key = source["mean_fpr_cov"] if scope == "cov" else source["mean_fpr"]
+    langs_key = source["n_languages_cov"] if scope == "cov" else source["n_languages"]
+    out[macro_key] = out[macro_key].map(lambda x: _fmt(x, 1, scale=100))
+    out[micro_key] = out[micro_key].map(lambda x: _fmt(x, 1, scale=100))
+    out[fpr_key] = out[fpr_key].map(lambda x: _fmt(x, 2, scale=100))
+    out[langs_key] = out[langs_key].map(lambda x: _fmt(x, 0))
+    out["samples_per_second"] = out["samples_per_second"].map(lambda x: _fmt(x, 1))
+    out = out[[k for k, _ in columns]]
+    out.columns = display_labels
     return out
@@ -314,23 +406,30 @@ def _format_license(license_name: str, license_url: str | None) -> str:
 def _make_select_handler(
     dataset_id: str,
-    table: Any,
     snapshot_root: Path,
 ) -> Any:
     """Build the row-select callback as a closure over the captured state.
+    The callback looks up the clicked row in the *current* table value
+    (passed in via Gradio's event arg) so that switching the scope radio
+    and then clicking a row drills down the row at its post-toggle
+    position, not the row that would have been there before the swap.
     Gradio inspects ``__defaults__`` when registering events, and comparing a
     DataFrame default against a type annotation hits an unimplemented arrow
     dtype path. A closure keeps the state out of the function signature.
     """
-    def _on_select(evt: gr.SelectData) -> tuple[str, Any]:
+    def _on_select(table_value: Any, evt: gr.SelectData) -> tuple[str, Any]:
         if evt.index is None:
             return ("_Click a row to load per-language metrics._", None)
         row_idx = evt.index[0] if isinstance(evt.index, list | tuple) else evt.index
         try:
-            model_id = table.iloc[row_idx]["Model"]
-        except (IndexError, KeyError):
+            data = table_value.get("data") if isinstance(table_value, dict) else None
+            if data is None:
+                return ("_Click a row to load per-language metrics._", None)
+            model_id = data[row_idx][0]
+        except (IndexError, KeyError, TypeError):
             return ("_Could not resolve clicked row._", None)
         per_lang = _per_language_drilldown(snapshot_root, dataset_id, model_id)
         return (
@@ -341,6 +440,19 @@ def _make_select_handler(
     return _on_select
+def _make_scope_handler(sub_df: Any) -> Any:
+    """Build the scope-radio change callback: swap the table data + legend in lockstep."""
+    def _on_change(scope: Scope) -> tuple[Any, str]:
+        help_items = _HEADLINE_COLUMN_HELP_COV if scope == "cov" else _HEADLINE_COLUMN_HELP
+        return (
+            _styled_value(_format_table(sub_df, scope=scope)),
+            _columns_help_markdown(help_items),
+        )
+    return _on_change
 def build_app(
     *,
     repo_id: str = DEFAULT_REPO_ID,
@@ -384,7 +496,7 @@ def build_app(
                 with gr.Tab(label=tab_label):
                     gr.Markdown(_dataset_metadata_markdown(dataset_id))
                     sub = df[df["dataset_id"] == dataset_id]
-                    table = _format_table(sub)
+                    table = _format_table(sub, scope="all")
                     if table.empty:
                         gr.Markdown(
                             f"_No results for `{dataset_id}` in `{repo_id}` yet."
@@ -394,6 +506,12 @@ def build_app(
                         )
                         continue
+                    scope_radio = gr.Radio(
+                        choices=SCOPE_CHOICES,
+                        value="all",
+                        label="Scoring scope",
+                        interactive=True,
+                    )
                     leaderboard = gr.Dataframe(
                         value=_styled_value(table),
                         datatype=_HEADLINE_DATATYPES,
@@ -402,7 +520,7 @@ def build_app(
                         label=f"{dataset_id} — sorted by Macro F1",
                     )
                     with gr.Accordion("What do these columns mean?", open=False):
-                        gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
+                        legend = gr.Markdown(_columns_help_markdown(_HEADLINE_COLUMN_HELP))
                     drilldown_label = gr.Markdown("_Click a row to load per-language metrics._")
                     # Seed the drilldown grid with an empty DataFrame so the Component
                     # has stable column headers before the first row click.
@@ -415,8 +533,14 @@ def build_app(
                     with gr.Accordion("What do these per-language columns mean?", open=False):
                         gr.Markdown(_columns_help_markdown(_DRILLDOWN_COLUMN_HELP))
+                    scope_radio.change(
+                        _make_scope_handler(sub),
+                        inputs=[scope_radio],
+                        outputs=[leaderboard, legend],
+                    )
                     leaderboard.select(
-                        _make_select_handler(dataset_id, table, snapshot_root),
+                        _make_select_handler(dataset_id, snapshot_root),
+                        inputs=[leaderboard],
                         outputs=[drilldown_label, drilldown],
                     )
         gr.Markdown(footer)

{commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/leaderboard/data.py RENAMED Viewed

@@ -14,11 +14,15 @@ from __future__ import annotations
 import json
 import logging
-from collections.abc import Iterable
+import math
+from collections.abc import Iterable, Mapping
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Any
+from commonlid.metrics.core import LanguageMetrics
+from commonlid.metrics.fpr import mean_false_positive_rate, mean_stats_with_coverage
 logger = logging.getLogger(__name__)
 DEFAULT_REPO_ID = "commoncrawl/commonlid-results"
@@ -40,6 +44,14 @@ class LeaderboardRow:
     gold set. That's a model-property number, not a paper headline, and
     it stays consistent across rows: every model is reported on the same
     "what languages did you actually output here" basis.
+    The ``*_cov`` mirror fields are the same metrics restricted to gold
+    samples whose language is in the model's declared support set
+    (``supported_languages``). They are ``None`` when no support set is
+    available — either the field is missing from ``summary.json`` (legacy
+    file), the field is JSON ``null`` (LLM-style models whose support set
+    is undefined), or the field is an empty list (degenerate "supports
+    zero languages"). All three render as em-dashes in the cov view.
     """
     dataset_id: str
@@ -57,6 +69,13 @@ class LeaderboardRow:
     commonlid_version: str
     timestamp: str
     is_imported: bool
+    supported_languages: list[str] | None
+    macro_f1_cov: float | None
+    macro_precision_cov: float | None
+    macro_recall_cov: float | None
+    micro_f1_cov: float | None
+    mean_fpr_cov: float | None
+    n_languages_cov: int | None
     def to_dict(self) -> dict[str, Any]:
         return asdict(self)
@@ -68,10 +87,107 @@ def _safe_mean_fpr(per_language: dict[str, dict[str, Any]]) -> float:
     return sum(vals) / len(vals) if vals else 0.0
+def _hydrate_per_language(
+    per_language: Mapping[str, Mapping[str, Any]],
+) -> dict[str, LanguageMetrics]:
+    """Reconstruct :class:`LanguageMetrics` objects from the serialised dict form."""
+    out: dict[str, LanguageMetrics] = {}
+    for lang, m in per_language.items():
+        out[lang] = LanguageMetrics(
+            gt_count=int(m.get("gt_count", 0)),
+            predictions=int(m.get("predictions", 0)),
+            correct=int(m.get("correct", 0)),
+            precision=float(m.get("precision", 0.0) or 0.0),
+            recall=float(m.get("recall", 0.0) or 0.0),
+            f1=float(m.get("f1", 0.0) or 0.0),
+            fpr=None if m.get("fpr") is None else float(m["fpr"]),
+        )
+    return out
+def _micro_average_over(rows: Mapping[str, LanguageMetrics]) -> tuple[float, float, float]:
+    """Pooled precision/recall/F1 over a (filtered) per-language slice.
+    Mirrors :func:`commonlid.metrics.aggregate.micro_average`'s
+    ``*_gold_only`` math but accepts a pre-filtered subset, which the
+    public helper does not.
+    """
+    total_correct = sum(m.correct for m in rows.values())
+    total_predictions = sum(m.predictions for m in rows.values())
+    total_gold = sum(m.gt_count for m in rows.values())
+    precision = total_correct / total_predictions if total_predictions > 0 else 0.0
+    recall = total_correct / total_gold if total_gold > 0 else 0.0
+    f1 = (
+        2 * precision * recall / (precision + recall)
+        if (precision + recall) > 0 and not math.isclose(precision + recall, 0.0)
+        else 0.0
+    )
+    return precision, recall, f1
+def _compute_cov_fields(
+    per_language_raw: Mapping[str, Mapping[str, Any]],
+    supported: list[str] | None,
+) -> dict[str, float | int | None]:
+    """Return the six cov-variant fields, or all ``None`` when no cov data.
+    ``supported`` semantics:
+    - ``None`` — model's support set is undefined (e.g. LLM); no cov data.
+    - ``[]`` — model declared zero supported languages; every cov metric
+      would divide by zero, so render as no-data.
+    - non-empty list — compute the cov metrics.
+    """
+    none_result: dict[str, float | int | None] = {
+        "macro_f1_cov": None,
+        "macro_precision_cov": None,
+        "macro_recall_cov": None,
+        "micro_f1_cov": None,
+        "mean_fpr_cov": None,
+        "n_languages_cov": None,
+    }
+    if not supported:
+        return none_result
+    supported_set = set(supported)
+    per_language = _hydrate_per_language(per_language_raw)
+    stats = mean_stats_with_coverage(per_language, model_supported_languages=supported_set)
+    cov = stats["cov"]
+    n_languages_cov = int(cov.get("cov_count", 0))
+    if n_languages_cov == 0:
+        # Supported set has no overlap with the dataset's gold; nothing
+        # meaningful to report.
+        return none_result
+    cov_rows = {
+        lang: m for lang, m in per_language.items() if m.gt_count > 0 and lang in supported_set
+    }
+    _micro_precision, _micro_recall, micro_f1 = _micro_average_over(cov_rows)
+    mean_fpr_cov = mean_false_positive_rate(per_language, language_whitelist=supported_set)
+    return {
+        "macro_f1_cov": float(cov["f1"]),
+        "macro_precision_cov": float(cov["precision"]),
+        "macro_recall_cov": float(cov["recall"]),
+        "micro_f1_cov": float(micro_f1),
+        "mean_fpr_cov": float(mean_fpr_cov),
+        "n_languages_cov": n_languages_cov,
+    }
 def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -> LeaderboardRow:
     macro = summary.get("macro", {})
     micro = summary.get("micro", {})
     extra = summary.get("extra", {}) or {}
+    per_language = summary.get("per_language", {}) or {}
+    # Tri-state: missing key, JSON null, or list. Anything else (e.g. an
+    # accidentally-serialised set) collapses to "unknown".
+    raw_supported = summary.get("supported_languages")
+    supported: list[str] | None = list(raw_supported) if isinstance(raw_supported, list) else None
+    cov = _compute_cov_fields(per_language, supported)
+    n_languages_cov = cov["n_languages_cov"]
     return LeaderboardRow(
         dataset_id=dataset_id,
         model_id=model_id,
@@ -79,7 +195,7 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
         macro_precision=float(macro.get("precision_gold_only", 0.0)),
         macro_recall=float(macro.get("recall_gold_only", 0.0)),
         micro_f1=float(micro.get("f1_gold_only", 0.0)),
-        mean_fpr=_safe_mean_fpr(summary.get("per_language", {})),
+        mean_fpr=_safe_mean_fpr(per_language),
         n_languages=int(macro.get("n_languages_observed", 0)),
         n_samples=int(summary.get("n_samples", 0)),
         n_samples_with_gold=int(summary.get("n_samples_with_gold", 0)),
@@ -88,6 +204,13 @@ def _row_from_summary(summary: dict[str, Any], dataset_id: str, model_id: str) -
         commonlid_version=str(summary.get("commonlid_version", "")),
         timestamp=str(summary.get("timestamp", "")),
         is_imported=("imported_from" in extra),
+        supported_languages=supported,
+        macro_f1_cov=cov["macro_f1_cov"],
+        macro_precision_cov=cov["macro_precision_cov"],
+        macro_recall_cov=cov["macro_recall_cov"],
+        micro_f1_cov=cov["micro_f1_cov"],
+        mean_fpr_cov=cov["mean_fpr_cov"],
+        n_languages_cov=int(n_languages_cov) if n_languages_cov is not None else None,
     )

{commonlid-0.2.2 → commonlid-0.2.3}/src/commonlid/models/__init__.py RENAMED Viewed

@@ -11,6 +11,7 @@ directly if you want to evaluate an LLM.
 from commonlid.models import afrolid as _afrolid  # noqa: F401
 from commonlid.models import cld2 as _cld2  # noqa: F401
 from commonlid.models import cld3 as _cld3  # noqa: F401
+from commonlid.models import commonlingua as _commonlingua  # noqa: F401
 from commonlid.models import fasttext_ft as _fasttext_ft  # noqa: F401
 from commonlid.models import funlangid as _funlangid  # noqa: F401
 from commonlid.models import glotlid as _glotlid  # noqa: F401

commonlid 0.2.2__tar.gz → 0.2.3__tar.gz

commonlid 0.2.2__tar.gz → 0.2.3__tar.gz