commonlid 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. commonlid/__init__.py +38 -0
  2. commonlid/cli.py +457 -0
  3. commonlid/core/__init__.py +24 -0
  4. commonlid/core/lid_dataset.py +249 -0
  5. commonlid/core/lid_model.py +104 -0
  6. commonlid/core/registry.py +78 -0
  7. commonlid/datasets/__init__.py +9 -0
  8. commonlid/datasets/bibles.py +87 -0
  9. commonlid/datasets/commonlid.py +26 -0
  10. commonlid/datasets/flores_dev.py +27 -0
  11. commonlid/datasets/nano.py +177 -0
  12. commonlid/datasets/smolsent.py +73 -0
  13. commonlid/datasets/social_media.py +27 -0
  14. commonlid/datasets/udhr.py +27 -0
  15. commonlid/datasets_tools/__init__.py +13 -0
  16. commonlid/datasets_tools/frequency_sample.py +36 -0
  17. commonlid/datasets_tools/stratified_sample.py +108 -0
  18. commonlid/evaluation/__init__.py +14 -0
  19. commonlid/evaluation/cache.py +79 -0
  20. commonlid/evaluation/evaluator.py +233 -0
  21. commonlid/evaluation/results.py +84 -0
  22. commonlid/leaderboard/__init__.py +25 -0
  23. commonlid/leaderboard/app.py +427 -0
  24. commonlid/leaderboard/data.py +154 -0
  25. commonlid/logging.py +17 -0
  26. commonlid/metrics/__init__.py +39 -0
  27. commonlid/metrics/aggregate.py +147 -0
  28. commonlid/metrics/core.py +167 -0
  29. commonlid/metrics/fpr.py +189 -0
  30. commonlid/metrics/support_matrix.py +52 -0
  31. commonlid/models/__init__.py +21 -0
  32. commonlid/models/_fasttext_base.py +147 -0
  33. commonlid/models/afrolid.py +69 -0
  34. commonlid/models/cld2.py +48 -0
  35. commonlid/models/cld3.py +181 -0
  36. commonlid/models/dspy_llm.py +251 -0
  37. commonlid/models/fasttext_ft.py +12 -0
  38. commonlid/models/funlangid.py +49 -0
  39. commonlid/models/glotlid.py +12 -0
  40. commonlid/models/openlidv2.py +12 -0
  41. commonlid/models/pyfranc.py +35 -0
  42. commonlid/preprocess/__init__.py +15 -0
  43. commonlid/preprocess/langcodes.py +158 -0
  44. commonlid/preprocess/openlid_normer.py +23 -0
  45. commonlid/py.typed +0 -0
  46. commonlid/vendor/__init__.py +1 -0
  47. commonlid/vendor/fun_langid.py +9967 -0
  48. commonlid-0.2.0.dist-info/METADATA +910 -0
  49. commonlid-0.2.0.dist-info/RECORD +52 -0
  50. commonlid-0.2.0.dist-info/WHEEL +4 -0
  51. commonlid-0.2.0.dist-info/entry_points.txt +2 -0
  52. commonlid-0.2.0.dist-info/licenses/LICENSE +201 -0
commonlid/__init__.py ADDED
@@ -0,0 +1,38 @@
"""CommonLID — language identification model/benchmark evaluation."""

# Import the submodule packages so every shipped model/dataset registers itself
# on bare ``import commonlid``. These imports are side-effect-only (the heavy
# dependencies — fasttext weights, transformers, dspy — load lazily inside each
# model's ``load()``), so this stays cheap.
from commonlid import datasets as _tasks  # noqa: F401
from commonlid import models as _models  # noqa: F401
from commonlid.core.lid_dataset import LIDDataset, PrivateDatasetAccessError
from commonlid.core.lid_model import LIDModel, LIDPrediction
from commonlid.core.registry import (
    get_dataset,
    get_model,
    list_datasets,
    list_models,
    register_dataset,
    register_model,
)
from commonlid.evaluation.evaluator import Evaluator
from commonlid.evaluation.results import Result

# BUG FIX: the 0.2.0 wheel shipped with a stale "0.1.0" here. Keep this in
# sync with the distribution version (pyproject / wheel metadata); results
# summaries record it as ``commonlid_version``.
__version__ = "0.2.0"

# Explicit public API: everything else (``_tasks``, ``_models``) is internal.
__all__ = [
    "Evaluator",
    "LIDDataset",
    "LIDModel",
    "LIDPrediction",
    "PrivateDatasetAccessError",
    "Result",
    "__version__",
    "get_dataset",
    "get_model",
    "list_datasets",
    "list_models",
    "register_dataset",
    "register_model",
]
commonlid/cli.py ADDED
@@ -0,0 +1,457 @@
1
+ """Typer command-line interface.
2
+
3
+ This is a thin facade over :mod:`commonlid.evaluation.evaluator` and the
4
+ registry. All heavy lifting happens in the package.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import csv
10
+ import json
11
+ import logging
12
+ import sys
13
+ from collections.abc import Iterator
14
+ from pathlib import Path
15
+ from typing import Annotated, Any
16
+
17
+ import typer
18
+
19
+ from commonlid import __version__
20
+ from commonlid.core.registry import (
21
+ get_dataset,
22
+ get_model,
23
+ list_datasets,
24
+ list_models,
25
+ )
26
+ from commonlid.evaluation.evaluator import Evaluator
27
+ from commonlid.evaluation.results import load_summary
28
+ from commonlid.logging import setup_logging
29
+ from commonlid.metrics.support_matrix import save_support_matrix
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
# Root CLI application. ``no_args_is_help`` makes bare ``commonlid`` print
# usage instead of erroring; shell completion is disabled to keep --help terse.
app = typer.Typer(
    name="commonlid",
    help="Evaluate language identification models on CommonLID and other benchmarks.",
    no_args_is_help=True,
    add_completion=False,
)
39
+
40
+
41
def _ensure_registry_loaded() -> None:
    """Import the subpackages whose submodule imports populate the registries."""
    # The import itself is the point: each subpackage registers its shipped
    # models/datasets at import time, so nothing needs to be done with the
    # returned modules.
    for pkg in ("commonlid.datasets", "commonlid.models"):
        __import__(pkg)
45
+
46
+
47
@app.callback()
def _main(
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable debug logging.")] = False,
) -> None:
    # Global callback: runs before every subcommand. Sets up logging first so
    # that registry population (which may log) is captured, then fills the
    # model/dataset registries.
    # NOTE(review): deliberately no docstring — Typer would surface it as the
    # app help text; the app-level ``help=`` string already covers that.
    setup_logging(verbose=verbose)
    _ensure_registry_loaded()
53
+
54
+
55
+ @app.command("version")
56
+ def version_cmd() -> None:
57
+ """Print the installed commonlid version."""
58
+ typer.echo(__version__)
59
+
60
+
61
+ @app.command("list-models")
62
+ def list_models_cmd(
63
+ as_json: Annotated[bool, typer.Option("--json", help="Output JSON instead of text.")] = False,
64
+ ) -> None:
65
+ """List all registered LID models."""
66
+ ids = list_models()
67
+ if as_json:
68
+ typer.echo(json.dumps(ids))
69
+ else:
70
+ for model_id in ids:
71
+ typer.echo(model_id)
72
+
73
+
74
+ @app.command("list-datasets")
75
+ def list_datasets_cmd(
76
+ as_json: Annotated[bool, typer.Option("--json", help="Output JSON instead of text.")] = False,
77
+ ) -> None:
78
+ """List all registered LID evaluation datasets."""
79
+ ids = list_datasets()
80
+ if as_json:
81
+ typer.echo(json.dumps(ids))
82
+ else:
83
+ for dataset_id in ids:
84
+ typer.echo(dataset_id)
85
+
86
+
87
# Marker prefix that routes a --model spec to the DSPy LLM code path,
# e.g. "dspy:azure/gpt-4o-mini" (see _resolve_model_spec).
DSPY_SPEC_PREFIX = "dspy:"
88
+
89
+
90
@app.command()
def run(
    model: Annotated[
        list[str],
        typer.Option(
            "--model",
            "-m",
            help=(
                "Model id (repeat to add more). Use 'dspy:<llm-model-name>' "
                "(e.g. 'dspy:azure/gpt-4o-mini') to evaluate an LLM via DSPy."
            ),
        ),
    ],
    dataset: Annotated[
        list[str], typer.Option("--dataset", "-d", help="Dataset id (repeat to add more).")
    ],
    output_dir: Annotated[
        Path,
        typer.Option("--output-dir", "-o", help="Directory to write results into."),
    ] = Path("./results"),
    batch_size: Annotated[int, typer.Option("--batch-size")] = 64,
    limit: Annotated[int, typer.Option("--limit", help="Cap samples (0 = no limit).")] = 0,
    no_cache: Annotated[bool, typer.Option("--no-cache", help="Disable prediction cache.")] = False,
    sample_count_threshold: Annotated[
        int,
        typer.Option("--sample-threshold", help="Skip langs with fewer gold samples than this."),
    ] = 0,
    # --- DSPy LLM flags (only used when one of --model is 'dspy:...') ---
    api_base: Annotated[
        str | None,
        typer.Option("--api-base", help="LLM provider base URL (required by dspy: models)."),
    ] = None,
    api_version: Annotated[str | None, typer.Option("--api-version")] = None,
    api_key: Annotated[str | None, typer.Option("--api-key")] = None,
    azure_ad_token: Annotated[
        bool, typer.Option("--azure-ad-token", help="Use Azure DefaultAzureCredential.")
    ] = False,
    temperature: Annotated[float | None, typer.Option("--temperature")] = None,
    max_tokens: Annotated[int | None, typer.Option("--max-tokens")] = None,
    max_completion_tokens: Annotated[int | None, typer.Option("--max-completion-tokens")] = None,
    llm_n_threads: Annotated[
        int,
        typer.Option("--llm-n-threads", help="Threads in the DSPy evaluator for LLM models."),
    ] = 1,
) -> None:
    """Run the evaluator over the requested (model, dataset) pairs.

    Mixes registry-backed classical LID models with DSPy-backed LLMs:
    pass ``--model GlotLID`` for a registered model or
    ``--model dspy:azure/gpt-4o-mini`` to spin up a DSPy LLM on the fly.
    """
    # Collected once so that every 'dspy:' spec shares the same connection
    # settings; _resolve_model_spec ignores this dict for registry models.
    llm_kwargs = {
        "api_base": api_base,
        "api_version": api_version,
        "api_key": api_key,
        "azure_ad_token": azure_ad_token,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "max_completion_tokens": max_completion_tokens,
        "batch_size": batch_size,
        "n_threads": llm_n_threads,
        # LLM response cache lives next to the results it produced.
        "cache_dir": output_dir / ".dspy_cache",
    }
    models = [_resolve_model_spec(spec, llm_kwargs) for spec in model]
    datasets = [get_dataset(d) for d in dataset]
    Evaluator(
        models=models,
        datasets=datasets,
        output_dir=output_dir,
        batch_size=batch_size,
        use_cache=not no_cache,
        # The CLI exposes 0 as "no limit"; the Evaluator expects None for that.
        limit=limit if limit > 0 else None,
        sample_count_threshold=sample_count_threshold,
    ).run()
164
+
165
+
166
def _resolve_model_spec(spec: str, llm_kwargs: dict[str, Any]) -> Any:
    """Resolve a CLI --model spec to a loaded :class:`LIDModel` instance."""
    if not spec.startswith(DSPY_SPEC_PREFIX):
        # Plain spec: straight registry lookup.
        return get_model(spec)

    name = spec.removeprefix(DSPY_SPEC_PREFIX)
    if not name:
        raise typer.BadParameter(
            "'dspy:' model spec requires a model name, e.g. 'dspy:azure/gpt-4o-mini'"
        )
    if not llm_kwargs.get("api_base"):
        raise typer.BadParameter(
            "DSPy LLM models require --api-base (e.g. your Azure endpoint URL)"
        )

    # Deferred import: dspy is an optional, heavyweight dependency that only
    # LLM evaluation needs.
    from commonlid.models.dspy_llm import DSPyLLMModel

    return DSPyLLMModel(llm_model_name=name, **llm_kwargs)
180
+
181
+
182
@app.command()
def predict(
    model: Annotated[str, typer.Option("--model", "-m", help="Model id.")],
    text: Annotated[str | None, typer.Option("--text", help="A single input string.")] = None,
    text_file: Annotated[
        Path | None,
        typer.Option("--text-file", help="Newline-delimited text file ('-' for stdin)."),
    ] = None,
) -> None:
    """Run a model against ad-hoc text, writing JSONL predictions to stdout."""
    if text is None and text_file is None:
        typer.echo("Either --text or --text-file must be provided.", err=True)
        raise typer.Exit(code=2)

    # --text comes first, then file/stdin lines; both may be combined.
    texts: list[str] = [] if text is None else [text]
    if text_file is not None:
        texts += _read_lines(text_file)

    lid_model = get_model(model)
    # strict=True guards against a model returning the wrong number of preds.
    for t, p in zip(texts, lid_model.predict(texts), strict=True):
        typer.echo(json.dumps({"text": t, "pred": p, "model": model}))
206
+
207
+
208
+ @app.command("generate-support-matrix")
209
+ def generate_support_matrix(
210
+ out: Annotated[Path, typer.Option("--out", help="Destination CSV path.")],
211
+ models: Annotated[
212
+ list[str] | None,
213
+ typer.Option(
214
+ "--model",
215
+ "-m",
216
+ help="Restrict to these model ids (repeatable). Default: every registered model.",
217
+ ),
218
+ ] = None,
219
+ ) -> None:
220
+ """Build the language x model support CSV by asking each model to enumerate its languages.
221
+
222
+ Models that cannot enumerate a concrete language list (e.g. LLMs, or cld3
223
+ when its optional bindings are not installed) are reported as skipped and
224
+ absent from the CSV. Heavy models (AfroLID, fasttext) will download
225
+ weights on first run.
226
+ """
227
+ ids = models if models else list_models()
228
+ matrix: dict[str, set[str]] = {}
229
+ skipped: list[tuple[str, str]] = []
230
+ for model_id in ids:
231
+ try:
232
+ model = get_model(model_id)
233
+ supported = model.discover_supported_languages()
234
+ except Exception as exc:
235
+ skipped.append((model_id, f"{type(exc).__name__}: {exc}"))
236
+ continue
237
+ if supported is None:
238
+ skipped.append((model_id, "discover_supported_languages() returned None"))
239
+ continue
240
+ matrix[model_id] = set(supported)
241
+
242
+ if not matrix:
243
+ typer.echo("No models produced a language list.", err=True)
244
+ raise typer.Exit(code=1)
245
+
246
+ save_support_matrix(matrix, out)
247
+ typer.echo(f"Wrote support matrix for {len(matrix)} model(s) to {out}")
248
+ for model_id, reason in skipped:
249
+ typer.echo(f" skipped {model_id}: {reason}", err=True)
250
+
251
+
252
# Sub-application mounted below as `commonlid leaderboard <serve|upload>`.
leaderboard_app = typer.Typer(
    name="leaderboard",
    help="Serve the CommonLID leaderboard or push results to the HF dataset.",
    no_args_is_help=True,
)
app.add_typer(leaderboard_app, name="leaderboard")
258
+
259
+
260
+ @leaderboard_app.command("serve")
261
+ def leaderboard_serve(
262
+ repo_id: Annotated[
263
+ str,
264
+ typer.Option(
265
+ "--repo-id",
266
+ help="HF dataset repo id holding <dataset>/<model>/summary.json files.",
267
+ ),
268
+ ] = "commoncrawl/commonlid-results",
269
+ revision: Annotated[
270
+ str | None,
271
+ typer.Option("--revision", help="Optional commit SHA / branch / tag to pin."),
272
+ ] = None,
273
+ cache_dir: Annotated[
274
+ Path | None,
275
+ typer.Option("--cache-dir", help="Override the HF snapshot cache directory."),
276
+ ] = None,
277
+ local_dir: Annotated[
278
+ Path | None,
279
+ typer.Option(
280
+ "--local-dir",
281
+ help="Skip the network and read summaries from this local directory instead.",
282
+ ),
283
+ ] = None,
284
+ server_name: Annotated[
285
+ str, typer.Option("--server-name", help="Bind address (set 0.0.0.0 for HF Spaces).")
286
+ ] = "127.0.0.1",
287
+ server_port: Annotated[int, typer.Option("--port", help="Port to bind.")] = 7860,
288
+ share: Annotated[
289
+ bool, typer.Option("--share/--no-share", help="Expose a public gradio.live URL.")
290
+ ] = False,
291
+ ) -> None:
292
+ """Launch the CommonLID leaderboard Gradio app (requires the [leaderboard] extra)."""
293
+ try:
294
+ from commonlid.leaderboard.app import build_app
295
+ except ImportError as exc:
296
+ typer.echo(
297
+ "leaderboard requires `commonlid[leaderboard]` (gradio + pandas). "
298
+ f"Install it via `uv sync --extra leaderboard`. (underlying error: {exc})",
299
+ err=True,
300
+ )
301
+ raise typer.Exit(code=2) from exc
302
+
303
+ demo = build_app(
304
+ repo_id=repo_id,
305
+ revision=revision,
306
+ cache_dir=cache_dir,
307
+ local_dir=local_dir,
308
+ )
309
+ demo.launch(server_name=server_name, server_port=server_port, share=share)
310
+
311
+
312
+ @leaderboard_app.command("upload")
313
+ def leaderboard_upload(
314
+ repo_id: Annotated[
315
+ str,
316
+ typer.Option(
317
+ "--repo-id",
318
+ help="HF dataset repo id (e.g. commoncrawl/commonlid-results).",
319
+ ),
320
+ ],
321
+ local_dir: Annotated[
322
+ Path,
323
+ typer.Option(
324
+ "--local-dir",
325
+ help="Local directory to upload (e.g. ./data/results).",
326
+ ),
327
+ ],
328
+ commit_message: Annotated[
329
+ str | None,
330
+ typer.Option("--commit-message", help="PR title; defaults to a refresh message."),
331
+ ] = None,
332
+ commit_description: Annotated[
333
+ str | None,
334
+ typer.Option("--commit-description", help="Optional PR body."),
335
+ ] = None,
336
+ exclude: Annotated[
337
+ list[str] | None,
338
+ typer.Option(
339
+ "--exclude",
340
+ help="Extra glob to skip (repeatable). `.cache/**` is always excluded.",
341
+ ),
342
+ ] = None,
343
+ skip_predictions: Annotated[
344
+ bool,
345
+ typer.Option(
346
+ "--skip-predictions",
347
+ help="Skip per-row predictions.jsonl files (keeps repo small; the leaderboard only reads summary.json).",
348
+ ),
349
+ ] = False,
350
+ revision: Annotated[
351
+ str | None,
352
+ typer.Option("--revision", help="Base branch for the PR (defaults to the repo's default)."),
353
+ ] = None,
354
+ ) -> None:
355
+ """Push a results folder to the HF dataset as a Pull Request.
356
+
357
+ Folder layout must be ``<dataset_id>/<model_id>/summary.json`` so the
358
+ leaderboard can read it. The upload is opened as a PR (never pushed
359
+ directly to the default branch) so the dataset owner can review before
360
+ merging.
361
+ """
362
+ if not local_dir.is_dir():
363
+ typer.echo(f"--local-dir {local_dir} not found or not a directory.", err=True)
364
+ raise typer.Exit(code=2)
365
+
366
+ try:
367
+ from huggingface_hub import HfApi
368
+ except ImportError as exc: # pragma: no cover - hub is a base dep
369
+ typer.echo(f"huggingface-hub is required: {exc}", err=True)
370
+ raise typer.Exit(code=2) from exc
371
+
372
+ ignore_patterns = [".cache/**", *(exclude or [])]
373
+ if skip_predictions:
374
+ ignore_patterns.append("**/predictions.jsonl")
375
+
376
+ msg = commit_message or f"Refresh results from {local_dir.name}"
377
+ logger.info("Uploading %s -> %s (PR; ignore=%s)", local_dir, repo_id, ignore_patterns)
378
+ commit_info = HfApi().upload_folder(
379
+ folder_path=str(local_dir),
380
+ repo_id=repo_id,
381
+ repo_type="dataset",
382
+ revision=revision,
383
+ ignore_patterns=ignore_patterns,
384
+ commit_message=msg,
385
+ commit_description=commit_description,
386
+ create_pr=True,
387
+ )
388
+ pr_url = getattr(commit_info, "pr_url", None) or str(commit_info)
389
+ typer.echo(f"Opened PR: {pr_url}")
390
+
391
+
392
+ @app.command("export-csv")
393
+ def export_csv(
394
+ results_dir: Annotated[
395
+ Path, typer.Option("--results-dir", help="Directory containing summary.json files.")
396
+ ],
397
+ out: Annotated[Path, typer.Option("--out", help="Destination CSV file.")],
398
+ ) -> None:
399
+ """Walk a results directory and flatten every per-language summary into one CSV."""
400
+ rows = list(_iter_summary_rows(results_dir))
401
+ if not rows:
402
+ typer.echo(f"No summary.json files found under {results_dir}", err=True)
403
+ raise typer.Exit(code=1)
404
+
405
+ fieldnames = [
406
+ "dataset_id",
407
+ "model_id",
408
+ "language",
409
+ "gt_count",
410
+ "predictions",
411
+ "correct",
412
+ "precision",
413
+ "recall",
414
+ "f1",
415
+ "samples_per_second",
416
+ "timestamp",
417
+ "commonlid_version",
418
+ ]
419
+ out.parent.mkdir(parents=True, exist_ok=True)
420
+ with out.open("w", newline="", encoding="utf-8") as f:
421
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
422
+ writer.writeheader()
423
+ for row in rows:
424
+ writer.writerow(row)
425
+ typer.echo(f"Wrote {len(rows)} rows to {out}")
426
+
427
+
428
def _read_lines(path: Path) -> list[str]:
    """Return the non-blank lines of *path* ('-' reads stdin), trailing newlines stripped."""
    # stdin yields lines with trailing "\n"; splitlines() does not — the
    # rstrip below normalizes both.
    raw = sys.stdin if str(path) == "-" else path.read_text(encoding="utf-8").splitlines()
    return [line.rstrip("\n") for line in raw if line.strip()]
434
+
435
+
436
def _iter_summary_rows(results_dir: Path) -> Iterator[dict[str, Any]]:
    # Yields one flat CSV-ready row per (summary file, language) pair found
    # anywhere under results_dir. sorted() makes the output order stable.
    # Run-level fields (samples_per_second, timestamp, commonlid_version) are
    # duplicated onto every language row; they are read with .get() because
    # older summary files may lack them, whereas the per-language metrics and
    # dataset/model ids are required keys.
    for summary_path in sorted(results_dir.rglob("summary.json")):
        summary = load_summary(summary_path)
        for language, m in summary.get("per_language", {}).items():
            yield {
                "dataset_id": summary["dataset_id"],
                "model_id": summary["model_id"],
                "language": language,
                "gt_count": m["gt_count"],
                "predictions": m["predictions"],
                "correct": m["correct"],
                "precision": m["precision"],
                "recall": m["recall"],
                "f1": m["f1"],
                "samples_per_second": summary.get("samples_per_second"),
                "timestamp": summary.get("timestamp"),
                "commonlid_version": summary.get("commonlid_version"),
            }
454
+
455
+
456
+ if __name__ == "__main__": # pragma: no cover
457
+ app()
@@ -0,0 +1,24 @@
1
+ """Abstract base classes and the model/dataset registry."""
2
+
3
+ from commonlid.core.lid_dataset import LIDDataset
4
+ from commonlid.core.lid_model import LIDModel, LIDPrediction
5
+ from commonlid.core.registry import (
6
+ get_dataset,
7
+ get_model,
8
+ list_datasets,
9
+ list_models,
10
+ register_dataset,
11
+ register_model,
12
+ )
13
+
14
+ __all__ = [
15
+ "LIDDataset",
16
+ "LIDModel",
17
+ "LIDPrediction",
18
+ "get_dataset",
19
+ "get_model",
20
+ "list_datasets",
21
+ "list_models",
22
+ "register_dataset",
23
+ "register_model",
24
+ ]