PyPI - jaxcld - Versions diffs - 0.1.0__tar.gz - Mend

jaxcld 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

jaxcld-0.1.0/PKG-INFO +80 -0
jaxcld-0.1.0/README.md +314 -0
jaxcld-0.1.0/README_PIP.md +44 -0
jaxcld-0.1.0/jaxcld/__init__.py +47 -0
jaxcld-0.1.0/jaxcld/models/__init__.py +11 -0
jaxcld-0.1.0/jaxcld/models/asr_model.py +495 -0
jaxcld-0.1.0/jaxcld/models/cvx_grelu_mlp.py +74 -0
jaxcld-0.1.0/jaxcld/models/cvx_mlp.py +63 -0
jaxcld-0.1.0/jaxcld/models/cvx_relu_mlp.py +120 -0
jaxcld-0.1.0/jaxcld/models/get_model.py +26 -0
jaxcld-0.1.0/jaxcld/models/grelu_mlp.py +44 -0
jaxcld-0.1.0/jaxcld/models/lang_detect_head.py +152 -0
jaxcld-0.1.0/jaxcld/models/relu_mlp.py +71 -0
jaxcld-0.1.0/jaxcld/models/two_layer_mlp.py +11 -0
jaxcld-0.1.0/jaxcld/optimizers/__init__.py +4 -0
jaxcld-0.1.0/jaxcld/optimizers/adamW.py +39 -0
jaxcld-0.1.0/jaxcld/optimizers/admm.py +103 -0
jaxcld-0.1.0/jaxcld/optimizers/dadapt_adamW.py +38 -0
jaxcld-0.1.0/jaxcld/optimizers/dist_shampoo/__init__.py +4 -0
jaxcld-0.1.0/jaxcld/optimizers/dist_shampoo/distributed_shampoo.py +2831 -0
jaxcld-0.1.0/jaxcld/optimizers/dist_shampoo/quantization_utils.py +115 -0
jaxcld-0.1.0/jaxcld/optimizers/pcg.py +69 -0
jaxcld-0.1.0/jaxcld/optimizers/sgd.py +38 -0
jaxcld-0.1.0/jaxcld/optimizers/shampoo.py +37 -0
jaxcld-0.1.0/jaxcld/optimizers/yogi.py +36 -0
jaxcld-0.1.0/jaxcld/preconditioner/__init__.py +4 -0
jaxcld-0.1.0/jaxcld/preconditioner/nystrom.py +102 -0
jaxcld-0.1.0/jaxcld/training/__init__.py +8 -0
jaxcld-0.1.0/jaxcld/training/train.py +164 -0
jaxcld-0.1.0/jaxcld/training/train_no_jit.py +126 -0
jaxcld-0.1.0/jaxcld/utils/__init__.py +4 -0
jaxcld-0.1.0/jaxcld/utils/linops_utils.py +50 -0
jaxcld-0.1.0/jaxcld/utils/load_data.py +459 -0
jaxcld-0.1.0/jaxcld/utils/metric_utils.py +59 -0
jaxcld-0.1.0/jaxcld/utils/model_utils.py +113 -0
jaxcld-0.1.0/jaxcld/utils/opt_utils.py +31 -0
jaxcld-0.1.0/jaxcld/utils/proximal_utils.py +22 -0
jaxcld-0.1.0/jaxcld/utils/train_utils.py +7 -0
jaxcld-0.1.0/jaxcld/utils/whisper_dataloader.py +142 -0
jaxcld-0.1.0/jaxcld.egg-info/PKG-INFO +80 -0
jaxcld-0.1.0/jaxcld.egg-info/SOURCES.txt +45 -0
jaxcld-0.1.0/jaxcld.egg-info/dependency_links.txt +1 -0
jaxcld-0.1.0/jaxcld.egg-info/requires.txt +28 -0
jaxcld-0.1.0/jaxcld.egg-info/top_level.txt +1 -0
jaxcld-0.1.0/pyproject.toml +55 -0
jaxcld-0.1.0/setup.cfg +4 -0
jaxcld-0.1.0/tests/test_final_dry_asr_and_heads.py +251 -0

jaxcld-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,80 @@
+Metadata-Version: 2.4
+Name: jaxcld
+Version: 0.1.0
+Summary: CLD: language detection heads for ASR models
+Author: CLD contributors
+License: MIT
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: numpy>=1.24
+Requires-Dist: torch>=2.0.0
+Requires-Dist: torchaudio>=2.0.0
+Requires-Dist: transformers==4.56.2
+Requires-Dist: scikit-learn>=1.3.0
+Provides-Extra: train
+Requires-Dist: datasets[audio]==3.6.0; extra == "train"
+Requires-Dist: soundfile>=0.12.1; extra == "train"
+Requires-Dist: scipy>=1.10; extra == "train"
+Requires-Dist: tqdm>=4.66; extra == "train"
+Requires-Dist: pandas>=1.5.0; extra == "train"
+Requires-Dist: librosa>=0.10.1; extra == "train"
+Requires-Dist: noisereduce>=3.0.0; extra == "train"
+Requires-Dist: pydub>=0.25.1; extra == "train"
+Requires-Dist: accelerate>=0.20.0; extra == "train"
+Requires-Dist: evaluate>=0.4.0; extra == "train"
+Requires-Dist: jiwer>=3.0.0; extra == "train"
+Requires-Dist: torchcodec==0.10.0; extra == "train"
+Requires-Dist: wandb>=0.15.0; extra == "train"
+Requires-Dist: tensorboard>=2.13.0; extra == "train"
+Requires-Dist: huggingface_hub>=0.17.0; extra == "train"
+Requires-Dist: gradio>=3.0.0; extra == "train"
+Requires-Dist: audiomentations==0.43.1; extra == "train"
+Requires-Dist: jax==0.7.2; extra == "train"
+Requires-Dist: optax==0.2.6; extra == "train"
+Requires-Dist: flax==0.11.2; extra == "train"
+Requires-Dist: python-dotenv==1.1.1; extra == "train"
+## jaxcld
+`jaxcld` is a lightweight language-detection module for multilingual ASR models (Whisper / MMS). It provides an `ASRModel` wrapper plus pluggable language detection heads you can attach at inference time.
+## Install
+```bash
+pip install jaxcld
+```
+If you are developing from source:
+```bash
+pip install -e .
+```
+## Using the package (minimal inference example)
+```python
+import numpy as np
+from jaxcld import ASRModel, CVXNNLangDetectHead, NNLangDetectHead, SVMLangDetectHead
+# 1) Load the base ASR model
+languages = ["en", "hi", "id", "ms", "zh"]
+asr = ASRModel.from_pretrained("openai/whisper-small", config={"languages": languages})
+# 2) Load a language detection head artifact (choose ONE)
+# head = CVXNNLangDetectHead.load("path/to/whisper-small_trained_cvx_mlp.pkl", asr)
+# head = NNLangDetectHead.load("path/to/openai_whisper-small_nn_head.pkl", asr)
+# head = SVMLangDetectHead.load("path/to/openai_whisper-small_linear_svm.pkl", asr)
+# 3) Attach head and run inference
+asr.set_lang_detect_head(head)
+audio_16k_mono: np.ndarray = ...  # shape (T,), sampling rate 16kHz
+pred_langs, pred_texts = asr.predict(audio_16k_mono)
+print(pred_langs[0], pred_texts[0])
+```
+## Notes
+- Head artifacts (`*.pkl`) are produced by training scripts in the source repository; this pip README intentionally focuses only on **package usage**.

jaxcld-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,314 @@
+## Convex Low-resource Accent-Robust Language Detection in Speech Recognition
+This repository provides the official implementation of **CLD**, a lightweight language-detection module for multilingual ASR. This codebase contains our pip-installable Python package (`jaxcld/`) as well as our training/benchmark scripts, implemented in JAX and optimized via ADMM for high performance in low-resource settings. In short, the package attaches a small language detection head (Convex NN / small NN / linear SVM) to ASR encoder representations and uses it to select the language token (Whisper) or adapter (MMS) before decoding.
+![Approach overview](assets/fig_1_2.png)
+## Highlights
+- High Accuracy: Excels in binary and multiclass language detection (Table 2).
+- Low-Resource Robustness: Effective with limited data (Figures 1 & 2).
+- Efficient: 13x training speedup over traditional NNs due to ADMM optimization and JAX.
+<!--
+## What’s in this repo
+- **`jaxcld/`**: package with `ASRModel` adapters (Whisper + MMS) and language detection heads
+- **Training scripts**
+  - **Whisper fine-tuning**: `train_whisper.py`
+  - **Convex head (CVXNN, JAX + ADMM/CRONOS)**: `train_cvxnn.py`
+  - **Small NN head (PyTorch)**: `train_nn.py`
+  - **Linear SVM head (sklearn)**: `train_linear_svm.py`
+- **Evaluation**: `benchmark_cld.py` (language detection metrics + WER/CER, with optional per-accent breakdown)
+- **Tests**: `tests/` (smoke-tests for loading heads and running inference end-to-end) -->
+## Requirements
+This repo supports two common setups:
+- **Package-only install** (inference usage):
+```bash
+pip install -e .
+```
+- **Full training/benchmark environment** (recommended if you run the scripts in this repo):
+```bash
+pip install -e ".[train]"
+```
+If you prefer installing from the pinned dependency list instead:
+```bash
+pip install -r requirements.txt
+```
+## Pip README (package-only)
+For the pip/PyPI page we use a separate, minimal README focused only on **using** the `jaxcld` package:
+- `README_PIP.md`
+## Using the package
+### Minimal inference example (Whisper)
+```python
+import numpy as np
+from jaxcld import ASRModel, CVXNNLangDetectHead, NNLangDetectHead, SVMLangDetectHead
+# 1) Load the base ASR model
+languages = ["en", "hi", "id", "ms", "zh"]
+asr = ASRModel.from_pretrained("openai/whisper-small", config={"languages": languages})
+# 2) Load a language detection head artifact (choose ONE)
+# head = CVXNNLangDetectHead.load("path/to/whisper-small_trained_cvx_mlp.pkl", asr)
+# head = NNLangDetectHead.load("path/to/openai_whisper-small_nn_head.pkl", asr)
+# head = SVMLangDetectHead.load("path/to/openai_whisper-small_linear_svm.pkl", asr)
+# 3) Attach head and run inference
+asr.set_lang_detect_head(head)
+audio_16k_mono: np.ndarray = ...  # shape (T,), sampling rate 16kHz
+pred_langs, pred_texts = asr.predict(audio_16k_mono)
+print(pred_langs[0], pred_texts[0])
+```
+## Training
+### Data format
+All training/evaluation scripts expect a **Hugging Face `DatasetDict` saved to disk** (loaded via `datasets.load_from_disk(...)`) with splits like `train`, `valid`, `test`. Use our `data_ingestion.py` script to prepare your data.
+```bash
+python data_ingestion.py \
+  --config configs/en_hi_config.json \
+  --out data/en_hi \
+  --common-voice-dir /absolute/path/to/CommonVoice \
+  --augment
+```
+- Required: `--config` JSON (see example below), `--out` save directory.
+- Optional: `--augment` enables audiomentations; `--musan-dir` for background noise; `--common-voice-dir` for local Common Voice.
+- Output: a saved `DatasetDict` at `data/en_hi` with columns: `audio`, `text`, `lang`, `accent`.
+Minimal config example (see more in `configs/`):
+```json
+{
+  "name": "English-Hindi example",
+  "languages": {
+    "en": {
+      "accents": [
+        { "code": "us", "column_name": "United States English", "dataset": "common_voice" }
+      ]
+    },
+    "hi": {
+      "accents": [
+        { "code": "hi", "column_name": "", "dataset": "common_voice" }
+      ]
+    }
+  },
+  "params": {
+    "samples_per_class": 1000,
+    "split": { "train": 0.8, "val": 0.1, "test": 0.1 }
+  }
+}
+```
+Notes:
+- Common Voice selection matches `column_name` against the `accents` column in `validated.tsv`. Use `override_code` to point to alternative folders (see `configs/final_config.json`).
+- Lahaja examples match by `native_language` (e.g., `"Telugu"`, `"Konkani"`).
+### Train language detection heads
+All heads are trained on **pooled encoder embeddings** extracted by `ASRModel.load_data(...)` from a dataset on disk.
+#### CVXNN (convex head, JAX + ADMM/CRONOS)
+```bash
+python train_cvxnn.py \
+  --model_name openai/whisper-small \
+  --dataset_path data/multiclass \
+  --languages en,hi,id,ms,zh \
+  --output_dir models/lang_heads \
+  --neuron 64 \
+  --beta 0.001 \
+  --rho 0.1 \
+  --admm_iters 6
+```
+This produces a pickled artifact like:
+- `models/lang_heads/openai/whisper-small/openai_whisper-small_trained_cvx_mlp.pkl`
+#### NN head (PyTorch)
+```bash
+python train_nn.py \
+  --dataset_path data/multiclass \
+  --model_name openai/whisper-small \
+  --languages en,hi,id,ms,zh \
+  --output_dir models/lang_heads \
+  --num_train_epochs 10 \
+  --learning_rate 1e-3 \
+  --per_device_train_batch_size 256
+```
+This produces a pickled artifact like:
+- `models/lang_heads/openai/whisper-small/openai_whisper-small_nn_head.pkl`
+#### Linear SVM head (sklearn)
+```bash
+python train_linear_svm.py \
+  --model_name openai/whisper-small \
+  --data_dir data/multiclass \
+  --languages en,hi,id,ms,zh \
+  --output_dir models/lang_heads \
+  --C 1.0 \
+  --max_iter 5000
+```
+This produces a pickled artifact like:
+- `models/lang_heads/openai/whisper-small/openai_whisper-small_linear_svm.pkl`
+#### Fine-tune Whisper
+Use `train_whisper.py` to fine-tune a Whisper checkpoint on a preprocessed dataset directory:
+```bash
+python train_whisper.py \
+  --data_dir data/multiclass \
+  --model_id openai/whisper-small \
+  --output_dir models/whisper-small-finetuned \
+  --num_train_epochs 3 \
+  --learning_rate 1e-5 \
+  --per_device_train_batch_size 8 \
+  --per_device_eval_batch_size 8 \
+  --gradient_accumulation_steps 1 \
+  --eval_strategy steps \
+  --eval_steps 1000 \
+  --save_steps 1000
+```
+Optional logging:
+```bash
+python train_whisper.py ... \
+  --wandb_project CLD \
+  --run_name whisper-small-finetune-final_dry
+```
+## Evaluation
+Use `benchmark_cld.py` to evaluate **language detection** and **transcription quality** (WER/CER) on the `test` split.
+### Whisper + CVXNN head
+```bash
+python benchmark_cld.py \
+  --dataset_path data/multiclass \
+  --model_name openai/whisper-small \
+  --cld_type cvx \
+  --cld_path models/lang_heads/openai/whisper-small/openai_whisper-small_trained_cvx_mlp.pkl \
+  --languages en,hi,id,ms,zh \
+  --batch_size 32 \
+  --no_wandb
+```
+### Whisper + NN head
+```bash
+python benchmark_cld.py \
+  --dataset_path data/multiclass \
+  --model_name openai/whisper-small \
+  --cld_type nn \
+  --cld_path models/lang_heads/openai/whisper-small/openai_whisper-small_nn_head.pkl \
+  --languages en,hi,id,ms,zh \
+  --batch_size 32 \
+  --no_wandb
+```
+### Whisper + linear SVM head
+```bash
+python benchmark_cld.py \
+  --dataset_path data/multiclass \
+  --model_name openai/whisper-small \
+  --cld_type linear_svm \
+  --cld_path models/lang_heads/openai/whisper-small/openai_whisper-small_linear_svm.pkl \
+  --languages en,hi,id,ms,zh \
+  --batch_size 32 \
+  --no_wandb
+```
+### Whisper vanilla language ID (no head)
+```bash
+python benchmark_cld.py \
+  --dataset_path data/multiclass \
+  --model_name openai/whisper-small \
+  --cld_type vanilla \
+  --languages en,hi,id,ms,zh \
+  --batch_size 32 \
+  --no_wandb
+```
+<!-- ## Pre-trained models
+_TBD._ This repo supports loading three head types:
+| Head type | Artifact | Loader |
+| --- | --- | --- |
+| CVXNN | `*_trained_cvx_mlp.pkl` | `CVXNNLangDetectHead.load(...)` |
+| NN | `*_nn_head.pkl` | `NNLangDetectHead.load(...)` |
+| Linear SVM | `*_linear_svm.pkl` | `SVMLangDetectHead.load(...)` | -->
+## Results
+Paper results (Table 5):
+![Table 5](assets/table_5.png)
+To reproduce the evaluation numbers for a given head, run `benchmark_cld.py` as shown in the Evaluation section.
+<!--
+## Tests
+```bash
+pytest -q
+```
+Note: tests are designed to **skip** if the local dataset at `data/test/final_dry/` is missing or if large model weights are unavailable.
+## Contributing
+- **Bugs / features**: please open an issue with a minimal reproduction.
+- **Pull requests**: keep changes focused, add/update tests when behavior changes, and document new scripts/flags in `README.md`.
+## License
+MIT (see `pyproject.toml`). -->
+<!-- ## Citation
+If you use this code in your work, please cite the paper:
+```bibtex
+@article{cld2026,
+  title   = {CLD: Convex Language Detection Heads for Accent-Robust Multilingual ASR},
+  author  = {TBD},
+  journal = {TBD},
+  year    = {2026}
+}
+``` -->
+<!-- ## Questions / missing info (to finalize this README)
+- **Paper metadata**: what is the final paper title, author list, venue, and arXiv/camera-ready link?
+- **Dataset recipe**: how should users reproduce `data/test/final_dry/` from raw sources (which datasets, filtering, splits, and preprocessing)?
+- **Accent labels**: what is the definition/source of the `accent` field (taxonomy + how it’s derived)?
+- **Default language set**: is `en,hi,id,ms,zh` the canonical set, or just the example from your experiments?
+- **Pretrained artifacts**: where should the pretrained Whisper checkpoints and head artifacts be hosted (HF Hub / Google Drive / release assets), and what are the exact filenames?
+- **Reproduction commands**: which exact `train_*` commands correspond to Table 5 (hyperparameters + seeds + compute setup)? -->

jaxcld-0.1.0/README_PIP.md ADDED Viewed

@@ -0,0 +1,44 @@
+## jaxcld
+`jaxcld` is a lightweight language-detection module for multilingual ASR models (Whisper / MMS). It provides an `ASRModel` wrapper plus pluggable language detection heads you can attach at inference time.
+## Install
+```bash
+pip install jaxcld
+```
+If you are developing from source:
+```bash
+pip install -e .
+```
+## Using the package (minimal inference example)
+```python
+import numpy as np
+from jaxcld import ASRModel, CVXNNLangDetectHead, NNLangDetectHead, SVMLangDetectHead
+# 1) Load the base ASR model
+languages = ["en", "hi", "id", "ms", "zh"]
+asr = ASRModel.from_pretrained("openai/whisper-small", config={"languages": languages})
+# 2) Load a language detection head artifact (choose ONE)
+# head = CVXNNLangDetectHead.load("path/to/whisper-small_trained_cvx_mlp.pkl", asr)
+# head = NNLangDetectHead.load("path/to/openai_whisper-small_nn_head.pkl", asr)
+# head = SVMLangDetectHead.load("path/to/openai_whisper-small_linear_svm.pkl", asr)
+# 3) Attach head and run inference
+asr.set_lang_detect_head(head)
+audio_16k_mono: np.ndarray = ...  # shape (T,), sampling rate 16kHz
+pred_langs, pred_texts = asr.predict(audio_16k_mono)
+print(pred_langs[0], pred_texts[0])
+```
+## Notes
+- Head artifacts (`*.pkl`) are produced by training scripts in the source repository; this pip README intentionally focuses only on **package usage**.

jaxcld-0.1.0/jaxcld/__init__.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""
+`jaxcld` package public API.
+The goal is to support:
+from jaxcld import ASRModel, CVXNNLangDetectHead
+"""
+from __future__ import annotations
+# Package version string.
+__version__ = "0.1.0"
+# Public API names. None of them is imported eagerly; each one is resolved on
+# first access by the module-level `__getattr__` defined below.
+__all__ = [
+    "ASRModel",
+    "CVXNNLangDetectHead",
+    "NNLangDetectHead",
+    "SVMLangDetectHead",
+]
+def __getattr__(name: str):
+    """Lazily resolve the public API classes on attribute access (module-level `__getattr__`).
+    Args:
+        name: Attribute name being looked up on this module.
+    Returns:
+        `ASRModel` (from `.models.asr_model`) or one of `CVXNNLangDetectHead`,
+        `NNLangDetectHead`, `SVMLangDetectHead` (from `.models.lang_detect_head`),
+        depending on `name`.
+    Raises:
+        ImportError: If importing the defining module raises `ModuleNotFoundError`;
+            the original exception is chained as the cause.
+        AttributeError: If `name` is not one of the four names handled here.
+    """
+    # Lazy imports so `import jaxcld` works even if optional heavy deps (torch, transformers)
+    # are not installed, while still supporting `from jaxcld import ASRModel, ...` when they are.
+    # NOTE(review): the resolved class is returned but not stored in the module
+    # namespace, so this function runs again on every access to these names.
+    try:
+        if name == "ASRModel":
+            from .models.asr_model import ASRModel
+            return ASRModel
+        if name == "CVXNNLangDetectHead":
+            from .models.lang_detect_head import CVXNNLangDetectHead
+            return CVXNNLangDetectHead
+        if name == "NNLangDetectHead":
+            from .models.lang_detect_head import NNLangDetectHead
+            return NNLangDetectHead
+        if name == "SVMLangDetectHead":
+            from .models.lang_detect_head import SVMLangDetectHead
+            return SVMLangDetectHead
+    except ModuleNotFoundError as e:
+        # NOTE(review): every ModuleNotFoundError raised during the imports above is
+        # reported with this generic message, whichever module was actually missing;
+        # the specific module is only visible through the chained cause (`from e`).
+        raise ImportError(
+            "Missing optional dependency. Install jaxcld with its runtime dependencies, e.g. "
+            "`pip install -e .` (or `pip install .`) and ensure `torch`, `torchaudio`, and "
+            "`transformers` are available."
+        ) from e
+    # Reached only when `name` matched none of the branches above.
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

jaxcld-0.1.0/jaxcld/models/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Model implementations and language detection heads.
+Note: keep this module light (avoid importing torch/transformers at import time).
+Import symbols from their defining modules directly, e.g.:
+from jaxcld.models.asr_model import ASRModel
+from jaxcld.models.lang_detect_head import CVXNNLangDetectHead
+"""
+# Deliberately empty: this package re-exports nothing, so
+# `from jaxcld.models import *` brings no names into scope.
+__all__ = []