zerosyl 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zerosyl-1.0.0/LICENCE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nicol Visser
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
zerosyl-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: zerosyl
3
+ Version: 1.0.0
4
+ Summary: Simple zero-resource syllable tokenization for spoken language modeling
5
+ License: MIT
6
+ License-File: LICENCE
7
+ Author: nicolvisser
8
+ Author-email: vissernicol@gmail.com
9
+ Requires-Python: >=3.11,<3.15
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Provides-Extra: cli
Requires-Dist: faiss-cpu (>=1.8.0,<2.0.0)
Requires-Dist: matplotlib (>=3.10.8,<4.0.0) ; extra == "cli"
Requires-Dist: numpy (>=2.0.0,<3.0.0)
Requires-Dist: pandas (>=2.2.2,<4.0.0) ; extra == "cli"
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "cli"
Requires-Dist: scikit-learn (>=1.8.0,<2.0.0) ; extra == "cli"
Requires-Dist: scipy (>=1.13.0,<2.0.0)
Requires-Dist: tgt (>=1.5,<2.0) ; extra == "cli"
Requires-Dist: torch (>=2.4.1,<3.0.0)
Requires-Dist: torchaudio (>=2.10.0,<3.0.0) ; extra == "cli"
Requires-Dist: torchcodec (>=0.8.0,<1.0.0) ; extra == "cli"
Requires-Dist: transformers (>=4.19.0,<6.0.0)
Requires-Dist: typer (>=0.23.0,<0.24.0) ; extra == "cli"
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "zerosyl"
3
+ version = "1.0.0"
4
+ description = "Simple zero-resource syllable tokenization for spoken language modeling"
5
+ authors = [
6
+ {name = "nicolvisser",email = "vissernicol@gmail.com"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ requires-python = ">=3.11,<3.15"
10
+ dependencies = [
11
+ "torch (>=2.4.1,<3.0.0)",
12
+ "numpy (>=2.0.0,<3.0.0)",
13
+ "scipy (>=1.13.0,<2.0.0)",
14
+ "faiss-cpu (>=1.8.0,<2.0.0)",
15
+ "transformers (>=4.19.0,<6.0.0)"
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ cli = [
20
+ "typer (>=0.23.0,<0.24.0)",
21
+ "torchcodec (>=0.0.1,<1.0.0)",
22
+ "tgt (>=1.5,<2.0)",
23
+ "pandas (>=2.0.0,<4.0.0)",
24
+ "torchaudio (>=2.10.0,<3.0.0)",
25
+ "matplotlib (>=3.10.8,<4.0.0)",
26
+ "pydub (>=0.25.1,<0.26.0)",
27
+ "typer (>=0.23.0,<0.24.0)",
28
+ "torchcodec (>=0.8.0,<1.0.0)",
29
+ "tgt (>=1.5,<2.0)",
30
+ "pandas (>=2.2.2,<4.0.0)",
31
+ "torchaudio (>=2.4.1,<3.0.0)",
32
+ "matplotlib (>=3.9.0,<4.0.0)",
33
+ "scikit-learn (>=1.8.0,<2.0.0)"
34
+ ]
35
+
36
+ [build-system]
37
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
38
+ build-backend = "poetry.core.masonry.api"
39
+
40
+ [dependency-groups]
41
+ dev = [
42
+ "ipykernel (>=7.2.0,<8.0.0)",
43
+ "isort (>=7.0.0,<8.0.0)",
44
+ "black (>=26.1.0,<27.0.0)",
45
+ "wandb (>=0.25.0,<0.26.0)"
46
+ ]
47
+
48
+ [project.scripts]
49
+ zerosyl = "zerosyl.cli:app"
@@ -0,0 +1,13 @@
1
"""Top-level exports for the zerosyl package."""

from .lm import LanguageModel, OPTConfig
from .wavlm.WavLM import WavLM, WavLMConfig
from .zerosyl import ZeroSylCollapsed, ZeroSylContinuous, ZeroSylDiscrete

# Names re-exported via `from zerosyl import *`.
__all__ = [
    "WavLM",
    "WavLMConfig",
    "ZeroSylCollapsed",
    "ZeroSylContinuous",
    "ZeroSylDiscrete",
    "LanguageModel",
    "OPTConfig",
]
@@ -0,0 +1,331 @@
1
+ import shutil
2
+ import sys
3
+ from pathlib import Path
4
+ from typing import Literal
5
+
6
+ import torch
7
+
8
# Typer (and typing_extensions) are only installed with the optional [cli]
# extra, so fail with an actionable message instead of a raw ImportError.
try:
    import typer
    from typing_extensions import Annotated  # Typer often uses this
except ImportError:
    print("Error: The ZeroSyl CLI components are not installed.")
    print("\nPlease reinstall with the [cli] extra:")
    print(' pip install "zerosyl[cli]"')
    # NOTE: a previous version also suggested `pip install "zerosyl[all]"`,
    # but the project defines no [all] extra — only [cli] — so that advice
    # was removed to avoid sending users down a dead end.
    sys.exit(1)
18
+
19
+
20
def check_dependencies():
    """Verify that FFmpeg is installed and accessible.

    Raises:
        typer.Exit: with code 1 when the `ffmpeg` executable is not on PATH.
    """
    # Bug fix: the PATH lookup had been commented out and replaced with a
    # debug `if True:`, so every command aborted with the FFmpeg error even
    # when FFmpeg was installed. Restore the real check.
    if shutil.which("ffmpeg") is None:
        typer.secho(
            "\n[!] ERROR: FFmpeg not found.",
            fg=typer.colors.WHITE,
            bg=typer.colors.RED,
            bold=True,
            err=True,
        )
        typer.echo("\nZeroSyl requires FFmpeg for audio processing.", err=True)
        install_cmd = typer.style(
            "sudo apt install ffmpeg", fg=typer.colors.CYAN, italic=True
        )
        typer.echo(f"Please install it using: {install_cmd}", err=True)
        raise typer.Exit(code=1)
37
+
38
+
39
# Root CLI application (exposed as the `zerosyl` console script).
app = typer.Typer()

# Sub-application grouping the `zerosyl evaluate ...` commands.
eval_app = typer.Typer(help="Evaluation utilities")
app.add_typer(eval_app, name="evaluate")
43
+
44
+
45
@app.command()
def encode(
    input_dir: Path = typer.Argument(
        ...,
        help="Input directory with audio.",
        exists=True,
        file_okay=False,
        dir_okay=True,
    ),
    output_dir: Path = typer.Argument(..., help="Output directory for segments."),
    input_pattern: str = typer.Option(
        "*.wav", help="Glob pattern to match input audio files."
    ),
    # NOTE(review): a Literal annotation as a Typer option type — confirm the
    # installed Typer version accepts Literal; older versions require an Enum.
    output_format: Literal["continuous", "discrete", "collapsed"] = typer.Option(
        "collapsed",
        "--output",
        help='Choose output type: "continuous" or "discrete" or "collapsed".',
    ),
    device: str = typer.Option(
        "cuda" if torch.cuda.is_available() else "cpu",
        help="Device to run WavLM on (e.g., 'cuda' or 'cpu').",
    ),
    batch_size: int = typer.Option(
        None,
        min=1,
        help="Batch size for encoding. If set, a parallel job is started, which is helpful for large datasets. Adjust based on your GPU memory.",
    ),
    num_data_workers: int = typer.Option(
        4,
        min=1,
        help="Number of data workers for loading data if running a parallel job.",
    ),
    num_logic_workers: int = typer.Option(
        1,
        min=1,
        help="Number of logic workers for boundary detection if running a parallel job.",
    ),
):
    """
    Segment waveforms in a directory into syllabic segments and save the output as .pt files. Must have ffmpeg installed on your system.

    -----------------------------------------------------------------------------

    - If --output is [green]continuous[/green], save the continuous representations.
    Each output file is a dictionary with keys: "starts", "ends" and "embeddings".
    "starts" and "ends" are long tensors containing frame indices (divide by the 50 Hz frame rate to get times in seconds).
    "embeddings" is a float32 tensor with one 1024-dimensional embedding per segment.

    - If --output is [green]discrete[/green] save the segments with a K-means cluster ID (one of 10,000 types) associated with each segment. One of 10,000 values.
    Each output file is a torch.long tensor with shape [num_segments, 3]. The columns are: start frame index, end frame index, cluster ID.

    - If --output is [green]collapsed[/green] save the segments with an K-means cluster ID (one of 9,116 types) associated with each segment, where all the centroids that correspond to silences are mapped to a single id, SIL=9115.
    Each output file is a torch.long tensor with shape [num_segments, 3]. The columns are: start frame index, end frame index, unit ID.

    -----------------------------------------------------------------------------

    """
    # Abort early with a helpful message when FFmpeg is missing.
    check_dependencies()

    # Deferred import: keeps CLI startup (e.g. `--help`) fast and avoids
    # loading encoding machinery until a command actually runs.
    from zerosyl.utils.encode import encode, encode_parallel

    if batch_size is None:
        # run single-threaded encoding
        encode(
            input_dir=input_dir,
            output_dir=output_dir,
            input_pattern=input_pattern,
            output_format=output_format,
            device=device,
        )
    else:
        # run parallel encoding
        encode_parallel(
            input_dir=input_dir,
            output_dir=output_dir,
            input_pattern=input_pattern,
            output_format=output_format,
            device=device,
            batch_size=batch_size,
            num_data_workers=num_data_workers,
            num_logic_workers=num_logic_workers,
        )
+ )
128
+
129
+
130
@eval_app.command()
def boundaries(
    segments_dir: Path = typer.Argument(
        ..., help="Directory containing segment files."
    ),
    textgrid_dir: Path = typer.Argument(
        ..., help="Directory containing TextGrid files."
    ),
    segments_pattern: str = typer.Option(
        "dev*/**/*.pt", help="Glob pattern for segment files."
    ),
    textgrid_pattern: str = typer.Option(
        "dev*/**/*.TextGrid", help="Glob pattern for TextGrid files."
    ),
    frame_rate: float = typer.Option(50.0, help="Frame rate (Hz)."),
    constant_shift: float = typer.Option(
        0.0, help="Constant time shift applied to boundaries (in seconds)."
    ),
    tolerance: float = typer.Option(
        0.05, help="Tolerance for matching boundaries (in seconds)."
    ),
):
    """
    Compute the boundary metrics of a segmentation.
    """
    # Fix: the docstring previously said "clustering metrics" — a copy-paste
    # error from the `clustering` command; this command reports boundary and
    # token precision/recall/F1, over-segmentation and R-value.
    from zerosyl.eval.boundaries import evaluate_boundary_metrics

    if constant_shift == 0.0:
        typer.echo(
            "INFO: You started the boundary evaluation without supplying a constant shift."
        )
        typer.echo("Just remember, in our paper we tuned the constant shift.")

    # `os_rate` is the over-segmentation rate (renamed from `os`, which
    # shadowed the standard-library module name).
    precision, recall, f1, os_rate, rvalue, token_precision, token_recall, token_f1 = (
        evaluate_boundary_metrics(
            segments_dir=segments_dir,
            textgrid_dir=textgrid_dir,
            segments_pattern=segments_pattern,
            textgrid_pattern=textgrid_pattern,
            frame_rate=frame_rate,
            constant_shift=constant_shift,
            tolerance=tolerance,
        )
    )

    # All metrics are printed as percentages.
    typer.echo(f"Precision {precision*100:8.2f}")
    typer.echo(f"Recall {recall*100:8.2f}")
    typer.echo(f"F1 {f1*100:8.2f}")
    typer.echo(f"OS {os_rate*100:8.2f}")
    typer.echo(f"R-value {rvalue*100:8.2f}")
    typer.echo()
    typer.echo(f"Token Precision {token_precision*100:8.2f}")
    typer.echo(f"Token Recall {token_recall*100:8.2f}")
    typer.echo(f"Token F1 {token_f1*100:8.2f}")
    typer.echo()
+ typer.echo()
185
+
186
+
187
@eval_app.command()
def bitrate(
    segments_dir: Path = typer.Argument(
        ..., help="Directory containing segment files."
    ),
    textgrid_dir: Path = typer.Argument(..., help="Directory with TextGrid files."),
    segments_pattern: str = typer.Option(
        "dev*/**/*.pt", help="Pattern for segment files."
    ),
    textgrid_pattern: str = typer.Option(
        "dev*/**/*.TextGrid", help="Pattern for TextGrid files."
    ),
    vocab_size: int = typer.Option(
        None, help="Optional vocab size. Otherwise, will infer the vocab size."
    ),
):
    """
    Compute the bitrate and frequency of a segmentation.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.bitrate import evaluate_bitrate_and_freq

    bitrate_value, freq_value = evaluate_bitrate_and_freq(
        segments_dir=segments_dir,
        textgrid_dir=textgrid_dir,
        segments_pattern=segments_pattern,
        textgrid_pattern=textgrid_pattern,
        vocab_size=vocab_size,
    )

    typer.echo(f"Bitrate: {bitrate_value:10.2f} bits/s")
    typer.echo(f"Frequency: {freq_value:10.2f} Hz")
218
+
219
+
220
@eval_app.command()
def clustering(
    segments_dir: Path = typer.Argument(
        ..., help="Directory containing segment files."
    ),
    textgrid_dir: Path = typer.Argument(..., help="Directory with TextGrid files."),
    segments_pattern: str = typer.Option(
        "dev*/**/*.pt", help="Pattern for segment files."
    ),
    textgrid_pattern: str = typer.Option(
        "dev*/**/*.TextGrid", help="Pattern for TextGrid files."
    ),
):
    """
    Compute the clustering metrics of a segmentation.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.clustering import evaluate_clustering_metrics

    per_cluster_purity, per_syllable_purity, snmi = evaluate_clustering_metrics(
        segments_dir=segments_dir,
        textgrid_dir=textgrid_dir,
        segments_pattern=segments_pattern,
        textgrid_pattern=textgrid_pattern,
    )

    typer.echo(f"Per-cluster purity: {per_cluster_purity:10.4f}")
    typer.echo(f"Per-syllable purity: {per_syllable_purity:10.4f}")
    typer.echo(f"Syllable-normalized mutual info: {snmi:10.4f}")
248
+
249
+
250
@eval_app.command()
def loglikelihoods(
    segments_dir: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=False,
        dir_okay=True,
        help="Directory containing .pt segment files.",
    ),
    output_path: Path = typer.Argument(
        ...,
        help="Where to write the log-likelihood text file.",
    ),
    checkpoint_path: str = typer.Option(
        None,
        help="Optional path or URL to the model checkpoint. Leave empty to download the default model remotely.",
    ),
    batch_size: int = typer.Option(
        64,
        min=1,
        help="Batch size for evaluation.",
    ),
    num_workers: int = typer.Option(
        8,
        min=0,
        help="Number of dataloader worker processes.",
    ),
    segments_pattern: str = typer.Option(
        "*.pt", help="Glob pattern to match segment files."
    ),
    normalize: bool = typer.Option(
        False, "--normalize", help="Normalize the loglikelihoods by token count"
    ),
):
    """
    Compute log-likelihoods for a directory of unit segment files using a ULM checkpoint.

    - Output is a text file where each line contains: <segment-file-stem> <log-likelihood>.

    - Supports optional checkpoint: if none is provided, the model is downloaded remotely.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.loglikelihood import compute_loglikelihoods

    # Validate the extension before any heavy work (model download, encoding).
    if output_path.suffix.lower() != ".txt":
        raise typer.BadParameter("output_path must have a .txt extension")

    compute_loglikelihoods(
        segments_dir=segments_dir,
        checkpoint_path=checkpoint_path,
        output_path=output_path,
        batch_size=batch_size,
        num_workers=num_workers,
        segments_pattern=segments_pattern,
        normalize=normalize,
    )
306
+
307
+
308
@eval_app.command()
def tsc(
    loglikelihoods_file: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=True,
        dir_okay=False,
        help="Path to the log-likelihoods .txt file.",
    )
):
    """
    Compute the TSC metric from a log-likelihoods file.

    - The log-likelihoods file should be a text file where each line contains: <segment-file-stem> <log-likelihood>.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.tsc import eval_tsc

    # Guard against passing the wrong artifact (e.g. a .pt segments file).
    if loglikelihoods_file.suffix.lower() != ".txt":
        raise typer.BadParameter("loglikelihoods_file must have a .txt extension")

    score = eval_tsc(loglikelihoods_file)

    typer.echo(f"TSC Score: {score:.4f}")
@@ -0,0 +1,58 @@
1
+ from pathlib import Path
2
+
3
+ import numpy as np
4
+ import tgt
5
+ import torch
6
+ from rich.progress import track
7
+
8
+
9
def evaluate_bitrate_and_freq(
    segments_dir: str | Path,
    textgrid_dir: str | Path,
    segments_pattern: str = "dev*/**/*.pt",
    textgrid_pattern: str = "dev*/**/*.TextGrid",
    vocab_size: int | None = None,
):
    """Compute the bitrate (bits/s) and unit frequency (Hz) of a segmentation.

    Args:
        segments_dir: Directory of .pt segment files; each is a torch.long
            tensor of shape [num_segments, 3] whose third column is the unit ID.
        textgrid_dir: Directory of TextGrid files paired 1:1 (by stem) with
            the segment files; used only for the audio durations.
        segments_pattern: Glob pattern for segment files under segments_dir.
        textgrid_pattern: Glob pattern for TextGrid files under textgrid_dir.
        vocab_size: Optional vocabulary size; inferred as max ID + 1 when None.

    Returns:
        (bitrate, freq): entropy-based bitrate in bits/s and unit rate in Hz.

    Raises:
        ValueError: if either glob matches nothing, the counts differ, or the
            file lists do not pair up by stem. (Previously these were bare
            `assert`s, which are silently stripped under `python -O`.)
    """
    segments_dir = Path(segments_dir)
    textgrid_dir = Path(textgrid_dir)

    segments_paths = sorted(segments_dir.glob(segments_pattern))
    textgrid_paths = sorted(textgrid_dir.glob(textgrid_pattern))

    if not segments_paths:
        raise ValueError(f"No segment files match {segments_pattern!r} in {segments_dir}")
    if not textgrid_paths:
        raise ValueError(f"No TextGrid files match {textgrid_pattern!r} in {textgrid_dir}")
    if len(segments_paths) != len(textgrid_paths):
        raise ValueError(
            f"Mismatched file counts: {len(segments_paths)} segment files vs "
            f"{len(textgrid_paths)} TextGrid files"
        )
    for sp, tp in zip(segments_paths, textgrid_paths):
        if sp.stem != tp.stem:
            raise ValueError(f"Unpaired files: {sp.name} vs {tp.name}")

    all_ids = []
    all_durations = []

    for sp, tp in track(
        zip(segments_paths, textgrid_paths),
        description="Calculating...",
        total=len(segments_paths),
    ):
        # Third column of the [num_segments, 3] tensor holds the unit IDs
        # (the start/end frame columns are not needed here).
        segments = torch.load(sp)
        all_ids.append(segments[:, 2].numpy())

        # TextGrid extent gives the audio duration in seconds.
        textgrid = tgt.read_textgrid(tp, include_empty_intervals=True)
        all_durations.append(textgrid.end_time - textgrid.start_time)

    # np.concatenate instead of the np.concat alias (alias only exists in
    # NumPy >= 2.0; concatenate works everywhere).
    all_ids = np.concatenate(all_ids, axis=0)
    total_duration = np.sum(all_durations)

    if vocab_size is None:
        vocab_size = all_ids.max() + 1
    counts = np.bincount(all_ids, minlength=vocab_size)
    # Epsilon keeps log2 finite for IDs that never occur.
    probs = counts / counts.sum() + 1e-10
    entropy = -np.sum(probs * np.log2(probs))
    bitrate = len(all_ids) * entropy / total_duration

    freq = len(all_ids) / total_duration

    return bitrate, freq