zerosyl 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zerosyl-1.0.0/LICENCE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nicol Visser
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
zerosyl-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: zerosyl
3
+ Version: 1.0.0
4
+ Summary: Simple zero-resource syllable tokenization for spoken language modeling
5
+ License: MIT
6
+ License-File: LICENCE
7
+ Author: nicolvisser
8
+ Author-email: vissernicol@gmail.com
9
+ Requires-Python: >=3.11,<3.15
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Provides-Extra: cli
Requires-Dist: faiss-cpu (>=1.8.0,<2.0.0)
Requires-Dist: matplotlib (>=3.10.8,<4.0.0) ; extra == "cli"
Requires-Dist: numpy (>=2.0.0,<3.0.0)
Requires-Dist: pandas (>=2.2.2,<4.0.0) ; extra == "cli"
Requires-Dist: pydub (>=0.25.1,<0.26.0) ; extra == "cli"
Requires-Dist: scikit-learn (>=1.8.0,<2.0.0) ; extra == "cli"
Requires-Dist: scipy (>=1.13.0,<2.0.0)
Requires-Dist: tgt (>=1.5,<2.0) ; extra == "cli"
Requires-Dist: torch (>=2.4.1,<3.0.0)
Requires-Dist: torchaudio (>=2.10.0,<3.0.0) ; extra == "cli"
Requires-Dist: torchcodec (>=0.8.0,<1.0.0) ; extra == "cli"
Requires-Dist: transformers (>=4.19.0,<6.0.0)
Requires-Dist: typer (>=0.23.0,<0.24.0) ; extra == "cli"
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "zerosyl"
3
+ version = "1.0.0"
4
+ description = "Simple zero-resource syllable tokenization for spoken language modeling"
5
+ authors = [
6
+ {name = "nicolvisser",email = "vissernicol@gmail.com"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ requires-python = ">=3.11,<3.15"
10
+ dependencies = [
11
+ "torch (>=2.4.1,<3.0.0)",
12
+ "numpy (>=2.0.0,<3.0.0)",
13
+ "scipy (>=1.13.0,<2.0.0)",
14
+ "faiss-cpu (>=1.8.0,<2.0.0)",
15
+ "transformers (>=4.19.0,<6.0.0)"
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ cli = [
20
+ "typer (>=0.23.0,<0.24.0)",
21
+ "torchcodec (>=0.0.1,<1.0.0)",
22
+ "tgt (>=1.5,<2.0)",
23
+ "pandas (>=2.0.0,<4.0.0)",
24
+ "torchaudio (>=2.10.0,<3.0.0)",
25
+ "matplotlib (>=3.10.8,<4.0.0)",
26
+ "pydub (>=0.25.1,<0.26.0)",
27
+ "typer (>=0.23.0,<0.24.0)",
28
+ "torchcodec (>=0.8.0,<1.0.0)",
29
+ "tgt (>=1.5,<2.0)",
30
+ "pandas (>=2.2.2,<4.0.0)",
31
+ "torchaudio (>=2.4.1,<3.0.0)",
32
+ "matplotlib (>=3.9.0,<4.0.0)",
33
+ "scikit-learn (>=1.8.0,<2.0.0)"
34
+ ]
35
+
36
+ [build-system]
37
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
38
+ build-backend = "poetry.core.masonry.api"
39
+
40
+ [dependency-groups]
41
+ dev = [
42
+ "ipykernel (>=7.2.0,<8.0.0)",
43
+ "isort (>=7.0.0,<8.0.0)",
44
+ "black (>=26.1.0,<27.0.0)",
45
+ "wandb (>=0.25.0,<0.26.0)"
46
+ ]
47
+
48
+ [project.scripts]
49
+ zerosyl = "zerosyl.cli:app"
@@ -0,0 +1,13 @@
1
"""Top-level exports for the zerosyl package."""

from .lm import LanguageModel, OPTConfig
from .wavlm.WavLM import WavLM, WavLMConfig
from .zerosyl import ZeroSylCollapsed, ZeroSylContinuous, ZeroSylDiscrete

# Names re-exported via `from zerosyl import *`.
__all__ = [
    "WavLM",
    "WavLMConfig",
    "ZeroSylCollapsed",
    "ZeroSylContinuous",
    "ZeroSylDiscrete",
    "LanguageModel",
    "OPTConfig",
]
@@ -0,0 +1,331 @@
1
+ import shutil
2
+ import sys
3
+ from pathlib import Path
4
+ from typing import Literal
5
+
6
+ import torch
7
+
8
# Typer (and typing_extensions) are only installed with the optional [cli]
# extra, so fail with an actionable message instead of a raw ImportError.
try:
    import typer
    from typing_extensions import Annotated  # Typer often uses this
except ImportError:
    print("Error: The ZeroSyl CLI components are not installed.")
    print("\nPlease reinstall with the [cli] extra:")
    print(' pip install "zerosyl[cli]"')
    # NOTE: a previous version also suggested `pip install "zerosyl[all]"`,
    # but the project defines no [all] extra — only [cli] — so that advice
    # was removed to avoid sending users down a dead end.
    sys.exit(1)
18
+
19
+
20
def check_dependencies():
    """Verify that FFmpeg is installed and accessible.

    Raises:
        typer.Exit: with code 1 when the `ffmpeg` executable is not on PATH.
    """
    # Bug fix: the PATH lookup had been commented out and replaced with a
    # debug `if True:`, so every command aborted with the FFmpeg error even
    # when FFmpeg was installed. Restore the real check.
    if shutil.which("ffmpeg") is None:
        typer.secho(
            "\n[!] ERROR: FFmpeg not found.",
            fg=typer.colors.WHITE,
            bg=typer.colors.RED,
            bold=True,
            err=True,
        )
        typer.echo("\nZeroSyl requires FFmpeg for audio processing.", err=True)
        install_cmd = typer.style(
            "sudo apt install ffmpeg", fg=typer.colors.CYAN, italic=True
        )
        typer.echo(f"Please install it using: {install_cmd}", err=True)
        raise typer.Exit(code=1)
37
+
38
+
39
# Root CLI application (exposed as the `zerosyl` console script).
app = typer.Typer()

# Sub-application grouping the `zerosyl evaluate ...` commands.
eval_app = typer.Typer(help="Evaluation utilities")
app.add_typer(eval_app, name="evaluate")
43
+
44
+
45
@app.command()
def encode(
    input_dir: Path = typer.Argument(
        ...,
        help="Input directory with audio.",
        exists=True,
        file_okay=False,
        dir_okay=True,
    ),
    output_dir: Path = typer.Argument(..., help="Output directory for segments."),
    input_pattern: str = typer.Option(
        "*.wav", help="Glob pattern to match input audio files."
    ),
    # NOTE(review): a Literal annotation as a Typer option type — confirm the
    # installed Typer version accepts Literal; older versions require an Enum.
    output_format: Literal["continuous", "discrete", "collapsed"] = typer.Option(
        "collapsed",
        "--output",
        help='Choose output type: "continuous" or "discrete" or "collapsed".',
    ),
    device: str = typer.Option(
        "cuda" if torch.cuda.is_available() else "cpu",
        help="Device to run WavLM on (e.g., 'cuda' or 'cpu').",
    ),
    batch_size: int = typer.Option(
        None,
        min=1,
        help="Batch size for encoding. If set, a parallel job is started, which is helpful for large datasets. Adjust based on your GPU memory.",
    ),
    num_data_workers: int = typer.Option(
        4,
        min=1,
        help="Number of data workers for loading data if running a parallel job.",
    ),
    num_logic_workers: int = typer.Option(
        1,
        min=1,
        help="Number of logic workers for boundary detection if running a parallel job.",
    ),
):
    """
    Segment waveforms in a directory into syllabic segments and save the output as .pt files. Must have ffmpeg installed on your system.

    -----------------------------------------------------------------------------

    - If --output is [green]continuous[/green], save the continuous representations.
    Each output file is a dictionary with keys: "starts", "ends" and "embeddings".
    "starts" and "ends" are long tensors containing frame indices (divide by the 50 Hz frame rate to get times in seconds).
    "embeddings" is a float32 tensor with one 1024-dimensional embedding per segment.

    - If --output is [green]discrete[/green] save the segments with a K-means cluster ID (one of 10,000 types) associated with each segment. One of 10,000 values.
    Each output file is a torch.long tensor with shape [num_segments, 3]. The columns are: start frame index, end frame index, cluster ID.

    - If --output is [green]collapsed[/green] save the segments with an K-means cluster ID (one of 9,116 types) associated with each segment, where all the centroids that correspond to silences are mapped to a single id, SIL=9115.
    Each output file is a torch.long tensor with shape [num_segments, 3]. The columns are: start frame index, end frame index, unit ID.

    -----------------------------------------------------------------------------

    """
    # Abort early with a helpful message when FFmpeg is missing.
    check_dependencies()

    # Deferred import: keeps CLI startup (e.g. `--help`) fast and avoids
    # loading encoding machinery until a command actually runs.
    from zerosyl.utils.encode import encode, encode_parallel

    if batch_size is None:
        # run single-threaded encoding
        encode(
            input_dir=input_dir,
            output_dir=output_dir,
            input_pattern=input_pattern,
            output_format=output_format,
            device=device,
        )
    else:
        # run parallel encoding
        encode_parallel(
            input_dir=input_dir,
            output_dir=output_dir,
            input_pattern=input_pattern,
            output_format=output_format,
            device=device,
            batch_size=batch_size,
            num_data_workers=num_data_workers,
            num_logic_workers=num_logic_workers,
        )
+ )
128
+
129
+
130
@eval_app.command()
def boundaries(
    segments_dir: Path = typer.Argument(
        ..., help="Directory containing segment files."
    ),
    textgrid_dir: Path = typer.Argument(
        ..., help="Directory containing TextGrid files."
    ),
    segments_pattern: str = typer.Option(
        "dev*/**/*.pt", help="Glob pattern for segment files."
    ),
    textgrid_pattern: str = typer.Option(
        "dev*/**/*.TextGrid", help="Glob pattern for TextGrid files."
    ),
    frame_rate: float = typer.Option(50.0, help="Frame rate (Hz)."),
    constant_shift: float = typer.Option(
        0.0, help="Constant time shift applied to boundaries (in seconds)."
    ),
    tolerance: float = typer.Option(
        0.05, help="Tolerance for matching boundaries (in seconds)."
    ),
):
    """
    Compute the boundary metrics of a segmentation.
    """
    # Fix: the docstring previously said "clustering metrics" — a copy-paste
    # error from the `clustering` command; this command reports boundary and
    # token precision/recall/F1, over-segmentation and R-value.
    from zerosyl.eval.boundaries import evaluate_boundary_metrics

    if constant_shift == 0.0:
        typer.echo(
            "INFO: You started the boundary evaluation without supplying a constant shift."
        )
        typer.echo("Just remember, in our paper we tuned the constant shift.")

    # `os_rate` is the over-segmentation rate (renamed from `os`, which
    # shadowed the standard-library module name).
    precision, recall, f1, os_rate, rvalue, token_precision, token_recall, token_f1 = (
        evaluate_boundary_metrics(
            segments_dir=segments_dir,
            textgrid_dir=textgrid_dir,
            segments_pattern=segments_pattern,
            textgrid_pattern=textgrid_pattern,
            frame_rate=frame_rate,
            constant_shift=constant_shift,
            tolerance=tolerance,
        )
    )

    # All metrics are printed as percentages.
    typer.echo(f"Precision {precision*100:8.2f}")
    typer.echo(f"Recall {recall*100:8.2f}")
    typer.echo(f"F1 {f1*100:8.2f}")
    typer.echo(f"OS {os_rate*100:8.2f}")
    typer.echo(f"R-value {rvalue*100:8.2f}")
    typer.echo()
    typer.echo(f"Token Precision {token_precision*100:8.2f}")
    typer.echo(f"Token Recall {token_recall*100:8.2f}")
    typer.echo(f"Token F1 {token_f1*100:8.2f}")
    typer.echo()
+ typer.echo()
185
+
186
+
187
@eval_app.command()
def bitrate(
    segments_dir: Path = typer.Argument(
        ..., help="Directory containing segment files."
    ),
    textgrid_dir: Path = typer.Argument(..., help="Directory with TextGrid files."),
    segments_pattern: str = typer.Option(
        "dev*/**/*.pt", help="Pattern for segment files."
    ),
    textgrid_pattern: str = typer.Option(
        "dev*/**/*.TextGrid", help="Pattern for TextGrid files."
    ),
    vocab_size: int = typer.Option(
        None, help="Optional vocab size. Otherwise, will infer the vocab size."
    ),
):
    """
    Compute the bitrate and frequency of a segmentation.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.bitrate import evaluate_bitrate_and_freq

    bitrate_value, freq_value = evaluate_bitrate_and_freq(
        segments_dir=segments_dir,
        textgrid_dir=textgrid_dir,
        segments_pattern=segments_pattern,
        textgrid_pattern=textgrid_pattern,
        vocab_size=vocab_size,
    )

    typer.echo(f"Bitrate: {bitrate_value:10.2f} bits/s")
    typer.echo(f"Frequency: {freq_value:10.2f} Hz")
218
+
219
+
220
@eval_app.command()
def clustering(
    segments_dir: Path = typer.Argument(
        ..., help="Directory containing segment files."
    ),
    textgrid_dir: Path = typer.Argument(..., help="Directory with TextGrid files."),
    segments_pattern: str = typer.Option(
        "dev*/**/*.pt", help="Pattern for segment files."
    ),
    textgrid_pattern: str = typer.Option(
        "dev*/**/*.TextGrid", help="Pattern for TextGrid files."
    ),
):
    """
    Compute the clustering metrics of a segmentation.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.clustering import evaluate_clustering_metrics

    per_cluster_purity, per_syllable_purity, snmi = evaluate_clustering_metrics(
        segments_dir=segments_dir,
        textgrid_dir=textgrid_dir,
        segments_pattern=segments_pattern,
        textgrid_pattern=textgrid_pattern,
    )

    typer.echo(f"Per-cluster purity: {per_cluster_purity:10.4f}")
    typer.echo(f"Per-syllable purity: {per_syllable_purity:10.4f}")
    typer.echo(f"Syllable-normalized mutual info: {snmi:10.4f}")
248
+
249
+
250
@eval_app.command()
def loglikelihoods(
    segments_dir: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=False,
        dir_okay=True,
        help="Directory containing .pt segment files.",
    ),
    output_path: Path = typer.Argument(
        ...,
        help="Where to write the log-likelihood text file.",
    ),
    checkpoint_path: str = typer.Option(
        None,
        help="Optional path or URL to the model checkpoint. Leave empty to download the default model remotely.",
    ),
    batch_size: int = typer.Option(
        64,
        min=1,
        help="Batch size for evaluation.",
    ),
    num_workers: int = typer.Option(
        8,
        min=0,
        help="Number of dataloader worker processes.",
    ),
    segments_pattern: str = typer.Option(
        "*.pt", help="Glob pattern to match segment files."
    ),
    normalize: bool = typer.Option(
        False, "--normalize", help="Normalize the loglikelihoods by token count"
    ),
):
    """
    Compute log-likelihoods for a directory of unit segment files using a ULM checkpoint.

    - Output is a text file where each line contains: <segment-file-stem> <log-likelihood>.

    - Supports optional checkpoint: if none is provided, the model is downloaded remotely.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.loglikelihood import compute_loglikelihoods

    # Validate the extension before any heavy work (model download, encoding).
    if output_path.suffix.lower() != ".txt":
        raise typer.BadParameter("output_path must have a .txt extension")

    compute_loglikelihoods(
        segments_dir=segments_dir,
        checkpoint_path=checkpoint_path,
        output_path=output_path,
        batch_size=batch_size,
        num_workers=num_workers,
        segments_pattern=segments_pattern,
        normalize=normalize,
    )
306
+
307
+
308
@eval_app.command()
def tsc(
    loglikelihoods_file: Path = typer.Argument(
        ...,
        exists=True,
        file_okay=True,
        dir_okay=False,
        help="Path to the log-likelihoods .txt file.",
    )
):
    """
    Compute the TSC metric from a log-likelihoods file.

    - The log-likelihoods file should be a text file where each line contains: <segment-file-stem> <log-likelihood>.
    """
    # Deferred import so the CLI loads without the eval-side dependencies.
    from zerosyl.eval.tsc import eval_tsc

    # Guard against passing the wrong artifact (e.g. a .pt segments file).
    if loglikelihoods_file.suffix.lower() != ".txt":
        raise typer.BadParameter("loglikelihoods_file must have a .txt extension")

    score = eval_tsc(loglikelihoods_file)

    typer.echo(f"TSC Score: {score:.4f}")
@@ -0,0 +1,58 @@
1
+ from pathlib import Path
2
+
3
+ import numpy as np
4
+ import tgt
5
+ import torch
6
+ from rich.progress import track
7
+
8
+
9
def evaluate_bitrate_and_freq(
    segments_dir: str | Path,
    textgrid_dir: str | Path,
    segments_pattern: str = "dev*/**/*.pt",
    textgrid_pattern: str = "dev*/**/*.TextGrid",
    vocab_size: int | None = None,
):
    """Compute the bitrate (bits/s) and unit frequency (Hz) of a segmentation.

    Args:
        segments_dir: Directory of .pt segment files; each is a torch.long
            tensor of shape [num_segments, 3] whose third column is the unit ID.
        textgrid_dir: Directory of TextGrid files paired 1:1 (by stem) with
            the segment files; used only for the audio durations.
        segments_pattern: Glob pattern for segment files under segments_dir.
        textgrid_pattern: Glob pattern for TextGrid files under textgrid_dir.
        vocab_size: Optional vocabulary size; inferred as max ID + 1 when None.

    Returns:
        (bitrate, freq): entropy-based bitrate in bits/s and unit rate in Hz.

    Raises:
        ValueError: if either glob matches nothing, the counts differ, or the
            file lists do not pair up by stem. (Previously these were bare
            `assert`s, which are silently stripped under `python -O`.)
    """
    segments_dir = Path(segments_dir)
    textgrid_dir = Path(textgrid_dir)

    segments_paths = sorted(segments_dir.glob(segments_pattern))
    textgrid_paths = sorted(textgrid_dir.glob(textgrid_pattern))

    if not segments_paths:
        raise ValueError(f"No segment files match {segments_pattern!r} in {segments_dir}")
    if not textgrid_paths:
        raise ValueError(f"No TextGrid files match {textgrid_pattern!r} in {textgrid_dir}")
    if len(segments_paths) != len(textgrid_paths):
        raise ValueError(
            f"Mismatched file counts: {len(segments_paths)} segment files vs "
            f"{len(textgrid_paths)} TextGrid files"
        )
    for sp, tp in zip(segments_paths, textgrid_paths):
        if sp.stem != tp.stem:
            raise ValueError(f"Unpaired files: {sp.name} vs {tp.name}")

    all_ids = []
    all_durations = []

    for sp, tp in track(
        zip(segments_paths, textgrid_paths),
        description="Calculating...",
        total=len(segments_paths),
    ):
        # Third column of the [num_segments, 3] tensor holds the unit IDs
        # (the start/end frame columns are not needed here).
        segments = torch.load(sp)
        all_ids.append(segments[:, 2].numpy())

        # TextGrid extent gives the audio duration in seconds.
        textgrid = tgt.read_textgrid(tp, include_empty_intervals=True)
        all_durations.append(textgrid.end_time - textgrid.start_time)

    # np.concatenate instead of the np.concat alias (alias only exists in
    # NumPy >= 2.0; concatenate works everywhere).
    all_ids = np.concatenate(all_ids, axis=0)
    total_duration = np.sum(all_durations)

    if vocab_size is None:
        vocab_size = all_ids.max() + 1
    counts = np.bincount(all_ids, minlength=vocab_size)
    # Epsilon keeps log2 finite for IDs that never occur.
    probs = counts / counts.sum() + 1e-10
    entropy = -np.sum(probs * np.log2(probs))
    bitrate = len(all_ids) * entropy / total_duration

    freq = len(all_ids) / total_duration

    return bitrate, freq